From cbfa4b006be27d6806cec21acb03e85d7fb34534 Mon Sep 17 00:00:00 2001 From: TacoRocket Date: Wed, 8 Apr 2026 21:03:48 -0500 Subject: [PATCH] phase4 live pass catch-up --- CHANGELOG.md | 38 +- README.md | 76 ++-- VERSION | 2 +- docs/activity-log-bundles.md | 74 ++++ docs/live-run-strategy.md | 73 +++ ...ase2-secrets-config-resource-checkpoint.md | 127 ------ .../phase3-compute-apps-network-checkpoint.md | 248 ----------- docs/phase4-command-discovery-checkpoint.md | 73 --- docs/release-process.md | 26 +- docs/release-readiness-checklist.md | 10 +- main.tf | 13 + outputs.tf | 114 ++--- scripts/export_activity_log_bundle.py | 382 ++++++++++++++++ scripts/validate_azurefox_lab.py | 417 ++++++++++++------ 14 files changed, 999 insertions(+), 674 deletions(-) create mode 100644 docs/activity-log-bundles.md create mode 100644 docs/live-run-strategy.md delete mode 100644 docs/phase2-secrets-config-resource-checkpoint.md delete mode 100644 docs/phase3-compute-apps-network-checkpoint.md delete mode 100644 docs/phase4-command-discovery-checkpoint.md create mode 100644 scripts/export_activity_log_bundle.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a39899..79565d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,16 +10,48 @@ stored in `VERSION`. ## [Unreleased] -No unreleased entries yet. 
+### Changed + +- retired the removed AzureFox `all-checks` path from the lab validator and current operator docs + so live runs no longer assume a grouped command that AzureFox `main` rejects as of April 8, 2026 +- documented an explicit live-run strategy for known slow paths, including Key Vault soft-delete + waits during OpenTofu apply and `role-trusts` latency during AzureFox validation +- added a lab-side Azure Activity Log bundle script and doc so SOC-style local log pulls can be + packaged with run-window phase markers without introducing a separate Azure logging backend +- added validator-emitted `command-timeline.json` artifacts plus bundle-export support so SOC + analysts can line AzureFox command start and finish times up with raw Azure Activity Log windows + +## [1.2.0] - 2026-04-05 + +### Added + +- one Azure Automation account with a system-assigned identity so the lab now owns a deterministic + Phase 4 `automation` proof surface instead of only validating zero-account execution + +### Changed + +- promoted `automation`, `devops`, `lighthouse`, and `cross-tenant` into the standalone validator + path with truth-preserving checks that distinguish deterministic proof from tenant-shaped or + external-config-shaped command behavior +- promoted `vmss` into the standalone validator path because the current lab already deploys a + stable internal VM scale set that AzureFox can read deterministically +- expanded `all-checks` validator coverage to include the current `storage` section and the newer + `lighthouse`, `automation`, `devops`, `vmss`, and `snapshots-disks` command membership reflected + in the main AzureFox repo +- updated repo docs and release language to target AzureFox `1.2.0` / Phase 4 parity instead of + leaving Phase 4 described as mostly discovery-only work +- tightened the documented truth boundary for external or tenant-shaped surfaces: + `cross-tenant` and `lighthouse` stay evidence-led, while `devops` requires a real Azure DevOps + 
organization for pipeline proof and otherwise should surface the expected configuration issue ## [1.1.0] - 2026-04-05 ### Added - Phase 3.5 checkpoint note for the AzureFox `1.1.0` release boundary: - `docs/phase3-compute-apps-network-checkpoint.md` + [phase3-compute-apps-network-checkpoint-2026-04-08.md](/Users/cfarley/Documents/AzureFox-reference/terraform-labs/phase3-compute-apps-network-checkpoint-2026-04-08.md) - Phase 4 live-capture note for the AzureFox `1.2.0` command lane: - `docs/phase4-command-discovery-checkpoint.md` + [phase4-command-discovery-checkpoint-2026-04-08.md](/Users/cfarley/Documents/AzureFox-reference/terraform-labs/phase4-command-discovery-checkpoint-2026-04-08.md) ### Changed diff --git a/README.md b/README.md index 6425724..8544b73 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ command implementation, and release source of truth. ## AzureFox Coverage -The lab is built to exercise this release-gated subset of AzureFox commands and sections. AzureFox +The lab is built to exercise this release-gated subset of AzureFox standalone commands. AzureFox may have additional commands on `main` that are still discovery-only or not yet backed by deterministic lab proof objects. 
@@ -38,6 +38,8 @@ Current validator coverage: - `whoami` - `inventory` +- `automation` +- `devops` - `arm-deployments` - `env-vars` - `tokens-credentials` @@ -46,12 +48,15 @@ Current validator coverage: - `permissions` - `privesc` - `role-trusts` +- `lighthouse` +- `cross-tenant` - `resource-trusts` - `auth-policies` - `managed-identities` - `keyvault` - `storage` - `vms` +- `vmss` - `nics` - `dns` - `endpoints` @@ -64,28 +69,33 @@ Current validator coverage: - `acr` - `databases` - `snapshots-disks` -- `all-checks --section identity` -- `all-checks --section network` -- `all-checks --section compute` -- `all-checks --section config` -- `all-checks --section secrets` -- `all-checks --section resource` + +Optional grouped follow-up: + +- `chains credential-path` +- `chains deployment-path` +- `chains escalation-path` The project is OpenTofu-first, but the HCL stays close to standard Terraform style so it feels familiar to most operators. Current checkpoint notes: -- `docs/phase3-compute-apps-network-checkpoint.md` -- `docs/phase4-command-discovery-checkpoint.md` +- `docs/activity-log-bundles.md` +- `docs/live-run-strategy.md` +- historical checkpoint notes now live under: + [phase2-secrets-config-resource-checkpoint-2026-04-08.md](/Users/cfarley/Documents/AzureFox-reference/terraform-labs/phase2-secrets-config-resource-checkpoint-2026-04-08.md) +- historical checkpoint notes now live under: + [phase3-compute-apps-network-checkpoint-2026-04-08.md](/Users/cfarley/Documents/AzureFox-reference/terraform-labs/phase3-compute-apps-network-checkpoint-2026-04-08.md) +- historical checkpoint notes now live under: + [phase4-command-discovery-checkpoint-2026-04-08.md](/Users/cfarley/Documents/AzureFox-reference/terraform-labs/phase4-command-discovery-checkpoint-2026-04-08.md) Current release boundary: -- this repo is now aligned to AzureFox `1.1.0` / Phase 3.5 for release-gated validation -- Phase 4 / `1.2.0` remains discovery-first here, except for `snapshots-disks`, which is 
now a - validator-backed proof surface -- broader PostgreSQL relational parity is still tracked as an AzureFox main-repo follow-up rather - than being overstated in this lab release +- this repo now targets AzureFox `1.2.0` / Phase 4 as the current parity boundary +- deterministic lab-backed proof now includes `snapshots-disks`, `vmss`, and one Automation account +- `lighthouse` and `cross-tenant` are validated as evidence-led tenant surfaces rather than fixed row-count proof +- `devops` is validated conditionally: without `AZUREFOX_DEVOPS_ORG`, the validator expects the truthful missing-organization issue instead of pretending pipeline coverage exists ## Lab Shape @@ -115,6 +125,7 @@ Current release boundary: - One AKS cluster with a public control-plane endpoint and system-assigned identity - One Azure Container Registry with public network access and admin user enabled - One Azure SQL server with one user database +- One Azure Automation account with a system-assigned identity - One public DNS zone plus one private DNS zone with a registration-enabled VNet link - Three deployment-history objects: one succeeded subscription deployment with linked template URI @@ -139,6 +150,7 @@ With this setup, AzureFox should surface: - Key Vault public-network, private-endpoint, and purge-protection posture from `keyvault` - public storage and open firewall findings from `storage` - a public VM with an attached identity from `vms` +- one internal VM scale set footprint from `vmss` - NIC attachment and public-IP references from `nics` - public IP and Azure-managed hostname visibility from `endpoints` - NIC-backed public ingress evidence from `network-ports` @@ -149,14 +161,13 @@ With this setup, AzureFox should surface: - AKS control-plane endpoint, agent-pool count, OIDC posture, and addon visibility from `aks` - ACR login-server, admin-user, webhook, replication, and policy posture from `acr` - Azure SQL endpoint, visible user-database inventory, and minimal TLS posture from 
`databases` +- Azure Automation account identity and zero-object execution posture from `automation` - managed-disk attachment, network-access, and encryption posture from `snapshots-disks` +- delegated-management evidence from `lighthouse` when the subscription exposes it +- outside-tenant trust evidence from `cross-tenant` without turning ambient tenant posture into a deterministic row-count claim +- Azure DevOps pipeline evidence from `devops` only when a real organization is configured - DNS zone inventory and private-endpoint-backed namespace usage from `dns` -- identity checkpoint orchestration artifacts from `all-checks --section identity` -- network checkpoint orchestration artifacts from `all-checks --section network` -- compute checkpoint orchestration artifacts from `all-checks --section compute` -- config checkpoint orchestration artifacts from `all-checks --section config` -- secrets checkpoint orchestration artifacts from `all-checks --section secrets` -- resource checkpoint orchestration artifacts from `all-checks --section resource` +- optional grouped follow-up through AzureFox `chains` families when you want a higher-level review path in addition to the standalone proof artifacts `auth-policies` is handled a little differently in this repo: @@ -301,8 +312,15 @@ By default the validator: - executes AzureFox from `--azurefox-dir` - runs in `--mode full`, which executes the current release-gated standalone AzureFox command set - prints progress lines before and after each AzureFox step, including elapsed time and target artifact directories +- records per-command UTC start and finish timestamps plus elapsed duration in `command-timeline.json` - stores proof artifacts under `proof-artifacts/latest` +For richer `devops` proof, point AzureFox at a real Azure DevOps organization before you run the validator: + +```bash +export AZUREFOX_DEVOPS_ORG=<your-devops-org> +``` + Optional flags: ```bash @@ -315,7 +333,6 @@ Useful scoped reruns: ```bash python3 
scripts/validate_azurefox_lab.py --mode commands-only -python3 scripts/validate_azurefox_lab.py --mode all-checks-only python3 scripts/validate_azurefox_lab.py --mode full python3 scripts/validate_azurefox_lab.py --mode full --skip-command role-trusts ``` @@ -323,31 +340,37 @@ python3 scripts/validate_azurefox_lab.py --mode full --skip-command role-trusts Runtime notes: - use `--mode full` as the single end-to-end validation run -- `--mode full` no longer bundles `all-checks`; run `--mode all-checks-only` separately only when you intentionally want wrapper coverage - `commands-only` is now just an explicit standalone-only rerun alias for the same command family as `full` - if the live lab is already up and you only changed outputs or validator expectations, refresh the OpenTofu state before rerunning validation so stale `validation_manifest` data does not cause a false mismatch - use `--mode commands-only` when you want the individual command outputs without the orchestration pass -- use `--mode all-checks-only` only when you are specifically validating the section wrapper and artifact emission path in isolation -- do not treat `all-checks-only` as part of the default release-validation sequence unless we explicitly decide the wrapper coverage is required - `role-trusts` can take several minutes because the Azure API path is slow; the validator now emits periodic wait lines during that step instead of appearing hung - after `role-trusts` has been validated once for the current phase, reruns can use `--skip-command role-trusts` unless you changed that slice or hit a blocker that points back to it +- Key Vault replacement during `tofu apply` can spend several minutes in Azure soft-delete before recreate completes; treat that as a known slow path rather than a surprise hang - more generally, do not rerun a known slow validation path by default; only pay that cost again when the changed slice touches it, a live blocker points back to it, or the team explicitly wants 
the extra proof +- use [docs/live-run-strategy.md](docs/live-run-strategy.md) as the standing rule set for full passes versus fast reruns Artifacts include: - one JSON payload per AzureFox command - copied loot files emitted by AzureFox -- `all-checks --section
` output plus `run-summary.json` for `identity`, `network`, `compute`, `config`, `secrets`, and `resource` +- `command-timeline.json` - `summary.json` - `summary.txt` - `azurefox-mismatch-report.md` - `identity-mismatch-report.md` - `azurefox-follow-up-items.md` +Optional SOC / detection artifact flow: + +- `command-timeline.json` now records UTC start and finish markers plus duration for each AzureFox validation command so analysts can correlate those markers against Azure control-plane activity +- this timestamp artifact only covers the validator command lane; keep recording `apply` and `destroy` timestamps separately when you want the full lab window in the bundle +- use [docs/activity-log-bundles.md](docs/activity-log-bundles.md) to pull Azure Activity Log locally for the full lab window and package it with phase markers plus validator command markers +- the bundle script is [export_activity_log_bundle.py](scripts/export_activity_log_bundle.py) + ## Evidence Boundaries This lab is here to validate AzureFox output against real Azure objects. 
It is not a substitute for @@ -364,8 +387,9 @@ What AzureFox can verify directly from read-only control-plane and Graph data: - that managed-identity token surfaces correlate across web workloads, VMs, and deployment history - that Azure-managed App Service and Function App hostnames are visible control-plane endpoint paths, not proven live ingress - that NIC-backed public ingress evidence comes from visible NSG allow rules rather than guessed reachability -- that storage, API Management, AKS, ACR, and Azure SQL depth stays evidence-based when only management metadata is visible +- that storage, VMSS, Automation, API Management, AKS, ACR, and Azure SQL depth stays evidence-based when only management metadata is visible - that the current DNS boundary stays at zone inventory and private-endpoint-backed namespace usage rather than record export or live resolution proof +- that `lighthouse`, `cross-tenant`, and `devops` stay honest about external prerequisites, tenant shape, or partial-read boundaries What only the lab can confirm once infrastructure exists and the validator has been run: diff --git a/VERSION b/VERSION index 9084fa2..26aaba0 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.1.0 +1.2.0 diff --git a/docs/activity-log-bundles.md b/docs/activity-log-bundles.md new file mode 100644 index 0000000..70851e5 --- /dev/null +++ b/docs/activity-log-bundles.md @@ -0,0 +1,74 @@ +# Activity Log Bundles + +Use this when you want a local Azure Activity Log bundle for SOC or detection work without sending +anything into a separate Azure logging backend. 
+ +The bundle script lives at: + +- `scripts/export_activity_log_bundle.py` + +It writes a local bundle directory with: + +- `run-window.json` +- `metadata.json` +- `timeline.md` +- `azure-activity-log.json` +- `command-timeline.json` if you pass the validator artifact into the exporter +- `<run-id>.zip` unless you pass `--no-zip` + +## Minimal Flow + +Record UTC timestamps for the infrastructure phases as you move through the lab run: + +```bash +START_UTC="$(date -u +%Y-%m-%dT%H:%M:%SZ)" +APPLY_START_UTC="$START_UTC" +tofu apply +APPLY_END_UTC="$(date -u +%Y-%m-%dT%H:%M:%SZ)" + +VALIDATE_START_UTC="$(date -u +%Y-%m-%dT%H:%M:%SZ)" +python3 scripts/validate_azurefox_lab.py --mode full +VALIDATE_END_UTC="$(date -u +%Y-%m-%dT%H:%M:%SZ)" + +DESTROY_START_UTC="$(date -u +%Y-%m-%dT%H:%M:%SZ)" +tofu destroy +DESTROY_END_UTC="$(date -u +%Y-%m-%dT%H:%M:%SZ)" +``` + +The validator now writes `proof-artifacts/latest/command-timeline.json` automatically. That file +captures per-command UTC start and finish markers plus elapsed duration for the AzureFox command +lane, so you no longer need to hand-build those validator markers yourself. + +Then export the bundle: + +```bash +python3 scripts/export_activity_log_bundle.py \ + --run-id live-20260408 \ + --start-time "$START_UTC" \ + --end-time "$DESTROY_END_UTC" \ + --command-timeline-file proof-artifacts/latest/command-timeline.json \ + --phase apply_start="$APPLY_START_UTC" \ + --phase apply_end="$APPLY_END_UTC" \ + --phase validate_start="$VALIDATE_START_UTC" \ + --phase validate_end="$VALIDATE_END_UTC" \ + --phase destroy_start="$DESTROY_START_UTC" \ + --phase destroy_end="$DESTROY_END_UTC" +``` + +By default this writes to: + +- `proof-artifacts/activity-log/<run-id>/` +- `proof-artifacts/activity-log/<run-id>.zip` + +## Notes + +- This is local pull-only. It does not require Log Analytics, Event Hub, or Storage export. +- The script uses `az monitor activity-log list`, so it needs a working Azure CLI login. +- The default `--max-events` is `5000`. 
Raise it if you expect a busier subscription window. +- `scripts/validate_azurefox_lab.py` now writes `proof-artifacts/latest/command-timeline.json` + automatically so you can hand the exporter a per-command start and finish timeline from the same run. +- `command-timeline.json` only covers the AzureFox validator commands. You still need manual + `date -u` markers or another wrapper for `tofu apply`, `tofu destroy`, and any other external + steps you want represented in the bundle window. +- The timeline file is there to help analysts line phase markers and AzureFox command markers up + with Azure event timestamps and operation names. diff --git a/docs/live-run-strategy.md b/docs/live-run-strategy.md new file mode 100644 index 0000000..5304f2a --- /dev/null +++ b/docs/live-run-strategy.md @@ -0,0 +1,73 @@ +# Live Run Strategy + +Use this when you are running the lab against a real Azure subscription and want a practical rule +set instead of rediscovering the same slow paths each time. + +## Known Slow Paths + +- `azurerm_key_vault.open` replacement is a normal Azure slow path. +- Azure Key Vault deletes move through soft-delete before the name is fully reusable, so OpenTofu + can spend several minutes waiting even after the vault disappears from the active subscription + view. +- `role-trusts` is a known slow Azure API path during AzureFox validation and can take several + minutes before JSON returns. + +## Release-Candidate Pass + +Pay the full slow-path cost once when the goal is a fresh release-candidate baseline: + +```bash +tofu init +tofu plan +tofu apply +python3 scripts/validate_azurefox_lab.py --mode full +tofu destroy +``` + +Keep `proof-artifacts/latest/command-timeline.json` from that validator pass so the same release +candidate run has per-command UTC markers if the SOC later wants the Azure Activity Log window. 
+ +Use this full pass when: + +- the lab shape changed +- the validator expectations changed in a way that touches the slow slices +- AzureFox changed its output for `keyvault` or `role-trusts` +- you need a new clean baseline for release notes or proof artifacts + +## Fast Rerun Strategy + +Do not pay every slow-path cost on every rerun. + +If the infrastructure is already up and the change did not touch the lab shape: + +```bash +tofu apply -refresh-only +python3 scripts/validate_azurefox_lab.py --mode full --skip-command role-trusts +``` + +Keep the refreshed run's `command-timeline.json` too. It is the quickest way to tell the difference +between a known slow validator command and a run that truly drifted or hung. + +Use this faster rerun when: + +- you changed validator wording, docs, or artifact handling +- you are rechecking a live mismatch that does not point back to `role-trusts` +- the current state only needs a manifest refresh before validation + +## Operational Rules + +- Treat Key Vault replacement waits as expected unless they exceed the normal Azure delete/recreate + window by a wide margin. +- Treat `role-trusts` as skippable on reruns once it has already been validated for the current + phase. +- Keep the validator's `command-timeline.json` with the same proof artifacts as `summary.json` so + later SOC bundle exports can show when each AzureFox command actually ran. +- Do not keep stale `all-checks` assumptions in the live-run workflow. Current AzureFox `main` + removed that grouped command on April 8, 2026; use standalone command validation here and treat + `chains` as optional grouped follow-up instead. +- If a rerun does not need new Azure infrastructure truth, prefer refresh-only plus validator rerun + over a full destroy/reapply cycle. 
+- If the SOC wants a local starter pack for detections, use + [activity-log-bundles.md](activity-log-bundles.md) + to package Azure Activity Log plus phase markers and per-command AzureFox markers after the run + window closes. diff --git a/docs/phase2-secrets-config-resource-checkpoint.md b/docs/phase2-secrets-config-resource-checkpoint.md deleted file mode 100644 index 5350148..0000000 --- a/docs/phase2-secrets-config-resource-checkpoint.md +++ /dev/null @@ -1,127 +0,0 @@ -# Phase 2 Secrets, Config, and Resource Trust Checkpoint - -Date: 2026-03-31 - -This file records the sister-repo catch-up boundary for the AzureFox Phase 2 milestone. - -## Phase 2 Slices That Landed In AzureFox - -- `keyvault` -- `resource-trusts` -- `arm-deployments` -- `env-vars` -- `tokens-credentials` - -## What The Lab Needs To Add - -### `keyvault` - -- one Key Vault with `publicNetworkAccess=Enabled`, firewall default action `Allow`, and no - private endpoint -- one Key Vault with `publicNetworkAccess=Enabled`, firewall default action `Deny`, and no - private endpoint -- one Key Vault with `publicNetworkAccess=Enabled` and a private endpoint present -- one Key Vault with `publicNetworkAccess=Disabled` and a private endpoint present -- one vault with purge protection disabled so the `keyvault` command still exercises its - recovery-control finding without leaking that finding into `resource-trusts` - -### `resource-trusts` - -- reuse the Phase 2 storage and Key Vault objects so this command is validated through the same - composed path AzureFox now uses live -- validate: - `anonymous-blob-access` - `public-network-default` - `public-network` for Key Vault - `private-endpoint` for both Storage and Key Vault -- keep `resource-trusts` focused on trust-relevant exposure findings only - -### `arm-deployments` - -- one subscription-scope deployment with: - output values recorded - a linked template URI -- one resource-group deployment 
with: - output values recorded - a linked parameters URI -- one failed resource-group deployment with no outputs - -### `env-vars` - -- one App Service with a system-assigned identity and a plain-text sensitive setting -- one Function App with both system-assigned and user-assigned identity -- one Key Vault-backed app setting that uses `keyVaultReferenceIdentity` -- one workload identity-bearing web app with no meaningful app settings so `tokens-credentials` - can prove that managed-identity workload coverage no longer depends on env-var rows existing - -### `tokens-credentials` - -- validate the Phase 2 correlation outputs rather than building a separate lab-only interpretation -- expected surface families: - plain-text credential-like app settings - Key Vault-backed settings - web workload managed-identity token paths - VM IMDS token-minting paths - deployment outputs - linked deployment content - -## What AzureFox Can Prove Directly - -- Key Vault public-network posture, private endpoint presence, RBAC mode, and purge-protection - posture from management-plane metadata -- Storage public-access, firewall default action, and private endpoint posture -- deployment history metadata such as scope, state, output counts, and linked template or - parameter URIs -- App Service and Function App setting names, value classification, workload identity context, and - Key Vault reference targets -- token or credential surface correlation from readable metadata across workloads, deployments, and - existing VM identity context - -## What Only The Lab Can Confirm - -- the intended Key Vault and deployment proof objects exist live and AzureFox surfaces them with - the same narrow wording used in fixtures -- `tokens-credentials` still reports identity-bearing web workloads even when app settings are - empty or otherwise absent from the payload -- the composed `resource-trusts` path stays aligned with the underlying `storage` and `keyvault` - outputs in a real subscription -- 
table-mode wording still stays operator-first once real artifact sets are generated - -## What This Phase Still Does Not Prove - -- actual Key Vault secret value readability -- live IMDS or managed-identity token exchange -- secret retrieval through a running workload -- deployment output sensitivity beyond the fact that output values were recorded -- private endpoint end-to-end reachability proof - -## Validator / Manifest Follow-Up - -- extend `validation_manifest` with a Phase 2 checkpoint section -- include explicit Phase 2 command coverage for: - `keyvault` - `resource-trusts` - `arm-deployments` - `env-vars` - `tokens-credentials` -- add validation for: - `all-checks --section config` - `all-checks --section secrets` - `all-checks --section resource` -- add assertions that: - `tokens-credentials` includes a managed-identity web workload even when no env-var rows carry - that workload - `tokens-credentials` finding ids remain unique per surface - `resource-trusts` includes the `kvlabdeny01` and `kvlabhybrid01` style Key Vault trust rows - `resource-trusts` does not emit the Key Vault purge-protection finding - -## Recommended Sister-Repo Next Actions - -- add the Phase 2 Key Vault objects first because both `keyvault` and `resource-trusts` depend on - them -- add deployment-history proof objects next -- add App Service / Function App config proof objects, including the identity-bearing empty-settings - workload -- extend `validation_manifest` -- extend `validate_azurefox_lab.py` -- add a Phase 2 proof-artifact summary alongside the existing Phase 1 checkpoint outputs diff --git a/docs/phase3-compute-apps-network-checkpoint.md b/docs/phase3-compute-apps-network-checkpoint.md deleted file mode 100644 index 92881f2..0000000 --- a/docs/phase3-compute-apps-network-checkpoint.md +++ /dev/null @@ -1,248 +0,0 @@ -# Phase 3.5 Compute, Apps, Network, and DNS Checkpoint - -Date: 2026-04-05 - -This file records the sister-repo catch-up boundary for the AzureFox `v1.1.0` 
Phase 3.5 release. - -The lab already proves the original end-of-Phase-3 breadth. The current catch-up target is the -Phase 3.5 follow-on depth that AzureFox shipped inside: - -- `storage` -- `dns` -- `api-mgmt` -- `aks` -- `acr` -- `databases` - -This checkpoint is intentionally narrower than current AzureFox `main`. - -- Immediate parity target: released AzureFox `v1.1.0` -- Do not block this checkpoint on later Phase 4 and `1.2.0` work -- Separate live-capture note for the current Phase 4 command lane: - `docs/phase4-command-discovery-checkpoint.md` - -## Catch-Up Execution Lanes - -Keep the lab work split by whether live Azure is actually needed. - -### No-Azure Lane - -- compare the shipped AzureFox command depth against the current lab manifest and validator -- restate the proof target in repo docs before changing release alignment -- queue release/version edits separately so minor doc or validator work does not force an Azure run - -### Azure Discovery Lane - -- deploy the current lab shape once and capture the current AzureFox `v1.1.0`-boundary evidence -- answer one question first: which grounded depth cues already exist in the current lab without new - OpenTofu objects? 
- -### No-Azure Implementation Lane - -- update `validation_manifest`, validator assertions, and checkpoint wording for every depth cue the - current live lab already proves - -### Azure Gap Lane - -- only add new OpenTofu objects when the discovery pass shows a real parity gap -- keep those changes isolated so infra-required work does not block unrelated repo maintenance - -### Azure Final-Proof Lane - -- rerun the deployed lab against AzureFox once the catch-up slice is implemented -- use that final run for proof artifacts and release readiness, not for exploratory discovery - -## Phase 3 Slices That Landed In AzureFox - -- `nics` -- `endpoints` -- `network-ports` -- `workloads` -- `app-services` -- `functions` -- `api-mgmt` -- `aks` -- `acr` -- `databases` -- `dns` - -## Phase 3.5 Target Boundary - -The `v1.1.0` catch-up extends the original Phase 3 boundary into Phase 3.5 in these specific ways. - -### `storage` - -Keep the existing two-account proof shape and validate the grounded management-plane depth AzureFox -now ships: - -- `public_network_access` -- `allow_shared_key_access` -- transport hardening such as `https_only` and `min_tls_version` -- service-shape cues such as `is_hns_enabled`, `is_sftp_enabled`, and other readable endpoint or - protocol posture - -Do not turn this into blob, container, queue, or file-share enumeration. - -### `dns` - -Keep DNS at namespace-usage depth rather than record analysis. - -- preserve public-zone name-server and record-count proof -- preserve private-zone virtual-network link and registration-link proof -- add private-endpoint-backed namespace cues such as `private_endpoint_reference_count` when the - lab exposes them - -Do not turn this into record export, live resolution testing, or takeover heuristics. 
- -### `api-mgmt` - -Extend APIM proof beyond the basic census: - -- subscription counts and active-state cues -- API subscription-required counts -- named-value total, secret-marked, and Key Vault-backed counts -- backend destination host visibility - -Do not treat this as proof of backend reachability or secret retrieval. - -### `aks` - -Extend AKS proof with Azure-side cluster depth: - -- `oidc_issuer_enabled` -- `workload_identity_enabled` -- readable addon and ingress-profile cues such as `addon_names` - -Do not cross into kubeconfig, pod, service, or other in-cluster collection. - -### `acr` - -Extend ACR proof with automation and governance depth: - -- webhook counts, enabled-webhook counts, and broad-scope cues -- replication counts and region context -- quarantine, retention, and trust-policy posture when readable - -Do not widen this into repository, tag, or image enumeration. - -### `databases` - -Keep Azure SQL proof in place, but treat grounded parity as cross-engine relational triage: - -- Azure SQL remains part of the proof base -- PostgreSQL Flexible Server and MySQL Flexible Server should be included if the live lab shape - actually exposes them -- if the current lab does not deploy those engines yet, record that honestly as a gap to close in a - separate infra slice rather than claiming full parity early - -## What The Lab Needs To Add - -### Shared Network / Workload Proof - -- keep the existing public VM, VMSS, App Services, and Function App as the deployed workload base -- add one explicit NSG allow rule on the workload subnet so `network-ports` has narrow, readable ingress evidence for the public VM -- validate: - `nics` attachment plus public-IP reference - `endpoints` public IP plus Azure-managed hostname visibility - `network-ports` NSG-backed public port evidence - `workloads` joined VM plus web census without overstating current VMSS coverage - -### Web / App Proof - -- reuse the existing App Service and Function App proof workloads from 
Phase 2 -- make public-network and TLS posture explicit in OpenTofu so `app-services` and `functions` stay deterministic -- keep Azure-managed hostname output evidence-based rather than treating it as proven live ingress - -### Service-Specific Resource Proof - -- one API Management service with management-plane inventory counts, subscription cues, named-value - depth, and backend-host visibility -- one AKS cluster with a visible control-plane FQDN, cluster identity, and readable OIDC, - workload-identity, or addon cues -- one ACR registry with a visible login server, public auth posture, and readable webhook, - replication, and policy cues -- one Azure SQL server with at least one visible user database -- separate relational-engine proof only if live discovery shows PostgreSQL or MySQL parity needs new - OpenTofu objects - -### DNS Phase 3.5 Proof - -- one public DNS zone with visible Azure name servers -- one private DNS zone with a registration-enabled virtual-network link -- keep DNS proof at zone and namespace-usage metadata only: - record-set totals from zone metadata - public-zone delegation count - private-zone linked-VNet and registration-link counts - private-endpoint reference counts when the zone-group path is readable - -## What AzureFox Can Prove Directly - -- NIC attachment, IP context, and public-IP references from control-plane network metadata -- public IP visibility and Azure-managed default hostnames as visible endpoint paths -- NSG allow-rule evidence for NIC-backed public exposure -- joined workload identity and endpoint context across compute and web assets -- App Service and Function App runtime, hostname, identity, and posture metadata -- API Management, AKS, ACR, and relational-database inventory and posture metadata -- DNS zone inventory, public delegation counts, private-zone VNet-link counts, and private-endpoint - reference counts when readable - -## What Only The Lab Can Confirm - -- the intended Phase 3 proof objects exist live and 
AzureFox surfaces them in the current JSON shape -- `endpoints` and `workloads` wording stays evidence-based for Azure-managed hostnames -- `network-ports` remains the stronger ingress-evidence surface for NIC-backed public exposure -- compute, network, and resource `all-checks` sections still emit stable artifact sets once run against a real tenant - -## What This Phase Still Does Not Prove - -- full effective-network reachability analysis -- actual HTTP reachability behind App Service or Function hostname publication -- AKS cluster internals beyond the visible control-plane metadata -- ACR image contents or pull success -- database query access or firewall-behavior proof -- DNS record contents, record targets, live resolution behavior, or takeover heuristics -- storage object names, data-plane ACLs, SAS material, or key retrieval - -## Validator / Manifest Follow-Up - -- extend `validation_manifest` with Phase 3.5 expectations rather than only the original - breadth checkpoint -- include explicit Phase 3 command coverage for: - `storage` - `nics` - `dns` - `endpoints` - `network-ports` - `workloads` - `app-services` - `functions` - `api-mgmt` - `aks` - `acr` - `databases` -- add validation for: - `all-checks --section network` - `all-checks --section compute` - `all-checks --section resource` -- keep the existing Phase 2 assertions in place so catch-up work does not silently regress earlier coverage - -## Live Discovery Checklist - -Use the first Azure run to answer these questions before adding infrastructure: - -- does `storage` already expose the public/private split plus shared-key, TLS, and service-shape - cues in the current lab deployment? -- does `dns` already surface private-endpoint reference counts for the current private zone? -- does `api-mgmt` already surface subscription counts, named-value secret counts, Key Vault-backed - named values, and backend hostnames from the current service shape? 
-- does `aks` already surface OIDC, workload-identity, or addon cues from the current cluster? -- does `acr` already surface webhook, replication, retention, and trust-policy posture from the - current registry? -- does `databases` still prove only Azure SQL, or do we need a separate infra slice for PostgreSQL - Flexible Server and MySQL Flexible Server parity? - -## Known Live-Proof Gaps To Track - -- local AzureFox checkout drift can invalidate Phase 3 validation if it is behind `main`, especially for `dns` -- sibling-repo proof should stay narrow if Azure control-plane defaults differ from the intended posture -- any live drift between manifest claims and actual Azure output should be treated as a sister-repo fix item, not silently accepted diff --git a/docs/phase4-command-discovery-checkpoint.md b/docs/phase4-command-discovery-checkpoint.md deleted file mode 100644 index 69e6599..0000000 --- a/docs/phase4-command-discovery-checkpoint.md +++ /dev/null @@ -1,73 +0,0 @@ -# Phase 4 Command Discovery Checkpoint - -Date: 2026-04-05 - -This note records the separate Phase 4 / AzureFox `v1.2.0` live capture that was taken while the -lab was already deployed for the Phase 3.5 parity run. 
- -Artifact root: - -- `/tmp/terraform-labs-phase4-discovery-20260405` - -Commands captured: - -- `snapshots-disks` -- `lighthouse` -- `cross-tenant` -- `automation` -- `devops` - -## What The Current Live Lab Already Shows - -### `snapshots-disks` - -- current lab exposes one attached managed OS disk for `vm-web-01` -- live output surfaced readable disk posture including: - `attachment_state=attached` - `network_access_policy=AllowAll` - `public_network_access=Enabled` - `encryption_type=EncryptionAtRestWithPlatformKey` - -### `cross-tenant` - -- current tenant produced 238 `cross_tenant_paths` -- the output includes tenant-level policy posture plus readable external service-principal paths -- one partial-read issue appeared for `auth_policies.security_defaults` with `403 Forbidden` - -### `lighthouse` - -- live command completed cleanly with zero delegations - -### `automation` - -- live command completed cleanly with zero automation accounts - -### `devops` - -- live command completed with zero pipelines because no Azure DevOps organization was configured -- issue surfaced as: - `Azure DevOps organization not configured; rerun with --devops-organization or set AZUREFOX_DEVOPS_ORG.` - -## What This Means For The Sister Repo - -- `snapshots-disks` is the cleanest current Phase 4 proof surface because the lab already deploys a - VM-backed managed disk that AzureFox can read deterministically -- that makes `snapshots-disks` the first Phase 4 command worth promoting into the sister-repo - validator boundary -- `cross-tenant` can be captured from the live tenant today, but its shape depends on tenant - posture and Graph permissions, so it should stay evidence-led rather than release-blocking until - the desired assertion boundary is defined -- `lighthouse`, `automation`, and `devops` currently prove command execution paths, not resource - depth, because the lab does not yet deploy deterministic objects for them - -## Next Promotion Rule - -Only move a Phase 4 command 
into the sister-repo validator boundary when at least one of these is -true: - -- the current lab already exposes a stable, deterministic proof object for that command -- or a separate infra slice deliberately adds that proof object without expanding unrelated scope - -For the current live run, keep most of Phase 4 as a captured reference lane. Promote only -`snapshots-disks` into the validator boundary for now, and leave the rest outside the Phase 3.5 -release gate until the lab owns deterministic proof for them. diff --git a/docs/release-process.md b/docs/release-process.md index 7bd2a8c..51ad25c 100644 --- a/docs/release-process.md +++ b/docs/release-process.md @@ -18,12 +18,11 @@ Version alignment rule: Current release boundary: -- the current lab release candidate aligns to AzureFox `1.1.0` Phase 3.5 parity +- the current lab release candidate aligns to AzureFox `1.2.0` / Phase 4 parity - the repo's `full` validator now matches that standalone release gate directly -- Phase 4 / `1.2.0` outputs can be captured during the same live run, but they remain discovery - work here unless the lab shape, validator assertions, and docs are deliberately promoted -- broader PostgreSQL relational parity remains an AzureFox main-repo fix item rather than a reason - to overclaim this lab release +- deterministic lab-backed proof now includes `snapshots-disks`, `vmss`, and one Automation account +- `lighthouse` and `cross-tenant` remain evidence-led because their truth boundary depends on live tenant posture +- `devops` remains conditional on a real Azure DevOps organization and should stay explicit about that operator prerequisite in release notes - treat this lab as a v1 artifact, not a `0.x` preview line ## Release Goals @@ -64,6 +63,8 @@ reliable, not try to hide it behind thin automation that obscures what is being python3 scripts/validate_azurefox_lab.py --mode full tofu destroy ``` + Preserve `proof-artifacts/latest/command-timeline.json` from that validation pass as 
part of the + release-candidate proof set. After `tofu destroy`, verify in Azure that the tagged lab footprint is actually gone before you call teardown complete: ```bash @@ -71,9 +72,14 @@ reliable, not try to hide it behind thin automation that obscures what is being az resource list --tag project=azurefox-proof-lab --query "[].{name:name,type:type,group:resourceGroup,location:location}" -o json ``` Do not rely on local destroy output alone when deciding that the subscription is clean. + Follow [live-run-strategy.md](live-run-strategy.md) + when deciding whether the current pass is a full release-candidate baseline or a faster rerun. If the lab is already deployed and you only changed outputs, manifest assumptions, or validator logic, run `tofu apply -refresh-only` before rerunning validation so the current `validation_manifest` output is recorded in state. + Treat Key Vault replacement as a normal Azure slow path: the vault can spend several minutes in + soft-delete before the recreate step completes, even after it disappears from the active + subscription view. If `role-trusts` has already been baseline-validated for the current phase and you did not touch that slice, reruns may use `--skip-command role-trusts` to avoid paying the known slow Azure API cost again. @@ -81,8 +87,13 @@ reliable, not try to hide it behind thin automation that obscures what is being changed slice touches that surface, a blocker points back to it, or the team explicitly agrees the extra proof is worth the runtime. 5. Review proof artifacts before release. - Check the generated `summary.json`, `summary.txt`, mismatch reports, follow-up items, and command - payloads for wording drift or unexpected live-tenant behavior. + Check the generated `summary.json`, `summary.txt`, `command-timeline.json`, mismatch reports, + follow-up items, and command payloads for wording drift or unexpected live-tenant behavior. 
+ Make sure `command-timeline.json` covers the expected command set and that its UTC timings line + up with any runtime claims or optional Activity Log bundle you plan to hand to the SOC. + If the SOC or detection team wants the full control-plane window for independent analysis, + package a local Azure Activity Log bundle after the run using + [activity-log-bundles.md](activity-log-bundles.md). 6. Reconfirm quota and cost assumptions. Validate that the documented fallback SKUs and region guidance still reflect what the team actually needed for deployment. @@ -102,6 +113,7 @@ Release notes should answer: - what changed in the validator or artifact layout - what operators should watch for around subscription quotas, regions, and runtime length - how teardown was verified from Azure rather than inferred from local OpenTofu output alone +- whether the command-timeline artifact or Activity Log bundle format changed in operator-visible ways - where the workflow is intentionally manual and what judgment the operator is still expected to apply - what known gaps still remain intentionally out of scope diff --git a/docs/release-readiness-checklist.md b/docs/release-readiness-checklist.md index 7c40563..6362c6d 100644 --- a/docs/release-readiness-checklist.md +++ b/docs/release-readiness-checklist.md @@ -26,17 +26,23 @@ Use this before tagging a lab release candidate. - `validation_manifest` still matches the OpenTofu-produced lab shape. - `scripts/validate_azurefox_lab.py` still validates the intended AzureFox command set. -- `--mode full`, `--mode commands-only`, and `--mode all-checks-only` all behave as documented. +- `--mode full` and `--mode commands-only` both behave as documented. - `--mode full` remains the single end-to-end validation gate for release readiness. 
-- `--mode commands-only` is only an explicit standalone rerun alias, while `--mode all-checks-only` stays a separate wrapper-check path rather than part of the default release gate. +- `--mode commands-only` is only an explicit standalone rerun alias rather than a separate release gate. - proof artifacts are written deterministically enough for operator review. - mismatch and follow-up reports stay evidence-based instead of normalizing live drift. +- tenant-shaped or external-config-shaped commands stay honest about their limits: + `cross-tenant` and `lighthouse` remain evidence-led, while `devops` either uses a real Azure DevOps organization or records the expected missing-organization issue clearly. +- the docs do not claim current AzureFox still supports `all-checks`; grouped follow-up is described truthfully through `chains` instead. ## Live Run Readiness - `tofu init`, `tofu plan`, and `tofu apply` succeed in a disposable subscription. - `python3 scripts/validate_azurefox_lab.py --mode full` completes successfully against the applied environment. +- the operator followed [live-run-strategy.md](live-run-strategy.md) + so Key Vault replacement and `role-trusts` waits were treated as known slow paths instead of ad + hoc failures. - proof artifacts were reviewed after the live run and do not show unexplained drift. - `tofu destroy` succeeds cleanly after validation. 
- Azure API checks confirm that tagged lab resource groups and resources are actually gone after diff --git a/main.tf b/main.tf index fcbade1..e39ecb4 100644 --- a/main.tf +++ b/main.tf @@ -34,6 +34,7 @@ locals { phase3_sql_admin_password = "AzFox!${substr(local.unique_suffix, 0, 4)}${substr(local.unique_suffix, 4, 4)}" phase3_public_dns_zone_name = "af-${substr(local.unique_suffix, 0, 6)}.example.net" phase3_private_dns_zone_name = "azurefox-${substr(local.unique_suffix, 0, 6)}.internal" + phase4_automation_name = "aa-ops-${substr(local.unique_suffix, 0, 6)}" phase2_sub_template_hash = substr(filemd5("${path.module}/scripts/arm-templates/sub-foundation.json"), 0, 8) phase2_rg_parameters_hash = substr(filemd5("${path.module}/scripts/arm-templates/kv-secrets.parameters.json"), 0, 8) phase2_sub_template_blob_name = "templates/sub-foundation-${local.phase2_sub_template_hash}.json" @@ -84,6 +85,18 @@ resource "azurerm_resource_group" "ops" { tags = local.tags } +resource "azurerm_automation_account" "phase4" { + name = local.phase4_automation_name + location = azurerm_resource_group.ops.location + resource_group_name = azurerm_resource_group.ops.name + sku_name = "Basic" + tags = local.tags + + identity { + type = "SystemAssigned" + } +} + resource "azurerm_virtual_network" "lab" { name = "vnet-azurefox-lab" address_space = ["10.42.0.0/16"] diff --git a/outputs.tf b/outputs.tf index 242221a..45a3722 100644 --- a/outputs.tf +++ b/outputs.tf @@ -109,7 +109,6 @@ output "validation_manifest" { validation_mode = "non-invasive" } identity_checkpoint = { - all_checks_section = "identity" commands = [ "whoami", "rbac", @@ -120,13 +119,12 @@ output "validation_manifest" { "auth-policies", "managed-identities", ] + grouped_follow_up = [ + "chains credential-path", + "chains escalation-path", + ] } phase2_checkpoint = { - all_checks_sections = [ - "config", - "secrets", - "resource", - ] commands = [ "keyvault", "resource-trusts", @@ -134,6 +132,9 @@ output "validation_manifest" 
{ "env-vars", "tokens-credentials", ] + grouped_follow_up = [ + "chains deployment-path", + ] key_vaults = { open = { expected_finding_prefix = "keyvault-public-network-open-" @@ -263,11 +264,6 @@ output "validation_manifest" { } } phase3_checkpoint = { - all_checks_sections = [ - "network", - "compute", - "resource", - ] commands = [ "storage", "nics", @@ -281,6 +277,10 @@ output "validation_manifest" { "aks", "acr", "databases", + "vmss", + ] + grouped_follow_up = [ + "chains deployment-path", ] storage = { public = { @@ -464,6 +464,17 @@ output "validation_manifest" { user_database_names = [azurerm_mssql_database.phase3.name] } } + vmss = { + api = { + identity_type = null + instance_count = 1 + name = azurerm_linux_virtual_machine_scale_set.vmss_api.name + nic_configuration_count = 1 + public_ip_configuration_count = 0 + sku_name = var.vmss_sku + subnet_id = azurerm_subnet.workload.id + } + } dns = { public_zone = { name = azurerm_dns_zone.phase3_public.name @@ -491,13 +502,42 @@ output "validation_manifest" { "Azure-managed hostnames in endpoints and workloads are visibility proof, not proven live ingress reachability.", "network-ports remains narrow NIC-backed public endpoint evidence and does not prove full effective-network reachability.", "Current DNS validation in this lab stays at namespace-usage metadata and private-endpoint reference counts because the current read path did not expose stable record totals, delegation details, or VNet-link counters.", - "The live ACR run did not consistently surface public-network or managed-identity posture even though the lab deployment enables both, so the validator avoids overclaiming those fields until the AzureFox read path is clarified.", + "If a live rerun against the current AzureFox checkout still omits ACR public-network or managed-identity posture, keep the validator conservative instead of overclaiming fields the read path does not return consistently.", ] } phase4_checkpoint = { commands = [ + 
"automation", + "devops", + "lighthouse", + "cross-tenant", "snapshots-disks", ] + automation = { + ops = { + certificate_count = 0 + connection_count = 0 + credential_count = 0 + encrypted_variable_count = 0 + hybrid_worker_group_count = 0 + identity_type = null + job_schedule_count = 0 + name = azurerm_automation_account.phase4.name + runbook_count = 0 + schedule_count = 0 + variable_count = 0 + webhook_count = 0 + } + } + devops = { + expect_unconfigured_issue_without_org = true + } + lighthouse = { + validation_mode = "evidence-led" + } + cross_tenant = { + validation_mode = "evidence-led" + } snapshots_disks = { vm_web_os_disk = { attached_to_name = azurerm_linux_virtual_machine.vm_web.name @@ -509,48 +549,20 @@ output "validation_manifest" { } } known_gaps = [ - "cross-tenant remains tenant- and permission-dependent, so it is useful live evidence but not yet a deterministic release-gated validator target.", - "lighthouse, automation, and devops remain discovery-only until the lab intentionally adds stable proof objects or required operator configuration.", + "cross-tenant remains tenant- and permission-dependent, so the lab keeps it evidence-led rather than row-count gated.", + "lighthouse remains subscription- and tenant-shaped; promote stronger assertions only if the lab intentionally adds delegated-management proof.", + "devops needs a real Azure DevOps organization, project, and pipeline path before it can move past conditional validation of command behavior and truthful issue surfacing.", + "automation currently validates the visible zero-object execution posture, but the current AzureFox read path did not return a managed-identity type for the lab-owned Automation account during the April 8, 2026 live pass.", ] } - all_checks_sections = { - identity = [ - "whoami", - "rbac", - "principals", - "permissions", - "privesc", - "role-trusts", - "auth-policies", - "managed-identities", - ] - config = [ - "arm-deployments", - "env-vars", - ] - secrets = [ - 
"keyvault", - "tokens-credentials", - ] - resource = [ - "resource-trusts", - "api-mgmt", - "acr", - "databases", - ] - network = [ - "nics", - "dns", - "endpoints", - "network-ports", - ] - compute = [ - "workloads", - "app-services", - "functions", - "aks", - "vms", + grouped_follow_up = { + command = "chains" + implemented_families = [ + "credential-path", + "deployment-path", + "escalation-path", ] + validation_mode = "optional-follow-up" } resource_groups = { network = azurerm_resource_group.network.name diff --git a/scripts/export_activity_log_bundle.py b/scripts/export_activity_log_bundle.py new file mode 100644 index 0000000..c2fad35 --- /dev/null +++ b/scripts/export_activity_log_bundle.py @@ -0,0 +1,382 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +import subprocess +import sys +import zipfile +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Pull Azure Activity Log events for a lab run window and package them locally." + ) + parser.add_argument( + "--subscription", + default=None, + help="Subscription ID to query. Defaults to the current Azure CLI subscription.", + ) + parser.add_argument( + "--run-id", + default=None, + help="Stable label for this bundle. Defaults to a UTC timestamp-based ID.", + ) + parser.add_argument( + "--start-time", + default=None, + help="Run window start in ISO 8601 / Azure CLI time format.", + ) + parser.add_argument( + "--end-time", + default=None, + help="Run window end in ISO 8601 / Azure CLI time format.", + ) + parser.add_argument( + "--phase", + action="append", + default=[], + metavar="NAME=TIMESTAMP", + help="Optional phase marker to store in the timeline, for example apply_start=2026-04-09T01:00:00Z.", + ) + parser.add_argument( + "--window-file", + type=Path, + default=None, + help="Optional JSON file containing run-window fields. 
If supplied, start/end come from the file unless explicitly overridden.", + ) + parser.add_argument( + "--command-timeline-file", + type=Path, + default=None, + help="Optional command-timeline.json emitted by validate_azurefox_lab.py. If supplied, it will be copied into the bundle and rendered in timeline.md.", + ) + parser.add_argument( + "--output-root", + type=Path, + default=Path("proof-artifacts") / "activity-log", + help="Directory where the bundle directory will be created.", + ) + parser.add_argument( + "--max-events", + type=int, + default=5000, + help="Maximum number of Activity Log events to request from Azure CLI.", + ) + parser.add_argument( + "--no-zip", + action="store_true", + help="Leave the bundle as a directory only and skip writing a zip archive.", + ) + return parser.parse_args() + + +def utc_now() -> datetime: + return datetime.now(UTC) + + +def default_run_id() -> str: + return utc_now().strftime("activity-%Y%m%dT%H%M%SZ") + + +def load_json(path: Path) -> dict[str, Any]: + try: + value = json.loads(path.read_text(encoding="utf-8")) + except FileNotFoundError as exc: + raise SystemExit(f"Window file not found: {path}") from exc + except json.JSONDecodeError as exc: + raise SystemExit(f"Window file is not valid JSON: {path}") from exc + if not isinstance(value, dict): + raise SystemExit(f"Window file must contain a JSON object: {path}") + return value + + +def load_command_timeline(path: Path) -> dict[str, Any]: + value = load_json(path) + command_runs = value.get("command_runs") + if not isinstance(command_runs, list): + raise SystemExit(f"Command timeline file is missing a command_runs list: {path}") + return value + + +def parse_phase_markers(items: list[str]) -> dict[str, str]: + markers: dict[str, str] = {} + for item in items: + if "=" not in item: + raise SystemExit(f"Invalid --phase value '{item}'. 
Use NAME=TIMESTAMP.") + name, value = item.split("=", 1) + name = name.strip() + value = value.strip() + if not name or not value: + raise SystemExit(f"Invalid --phase value '{item}'. Use NAME=TIMESTAMP.") + markers[name] = value + return markers + + +def normalize_window(args: argparse.Namespace) -> dict[str, Any]: + window = load_json(args.window_file) if args.window_file else {} + run_id = args.run_id or str(window.get("run_id") or default_run_id()) + start_time = args.start_time or window.get("start_utc") or window.get("start_time") + end_time = args.end_time or window.get("end_utc") or window.get("end_time") + if not start_time or not end_time: + raise SystemExit("Provide --start-time and --end-time, or a --window-file with start/end fields.") + + phases = parse_phase_markers(args.phase) + if not phases: + for key, value in window.items(): + if key in {"run_id", "start_utc", "start_time", "end_utc", "end_time"}: + continue + if key.endswith("_utc") and isinstance(value, str): + phases[key.removesuffix("_utc")] = value + + normalized = { + "run_id": run_id, + "start_utc": start_time, + "end_utc": end_time, + } + for name, value in sorted(phases.items()): + normalized[f"{name}_utc"] = value + return normalized + + +def run_json(cmd: list[str]) -> Any: + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0: + raise SystemExit( + f"Command failed ({result.returncode}): {' '.join(cmd)}\n" + f"STDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}" + ) + try: + return json.loads(result.stdout) + except json.JSONDecodeError as exc: + raise SystemExit( + f"Command did not return JSON: {' '.join(cmd)}\nSTDOUT:\n{result.stdout}" + ) from exc + + +def current_subscription_id() -> str: + payload = run_json(["az", "account", "show", "--output", "json", "--only-show-errors"]) + subscription_id = payload.get("id") + if not subscription_id: + raise SystemExit("Azure CLI did not return a current subscription id.") + return str(subscription_id) + 
+ +def fetch_activity_log( + *, + subscription_id: str, + start_time: str, + end_time: str, + max_events: int, +) -> list[dict[str, Any]]: + payload = run_json( + [ + "az", + "monitor", + "activity-log", + "list", + "--subscription", + subscription_id, + "--start-time", + start_time, + "--end-time", + end_time, + "--max-events", + str(max_events), + "--output", + "json", + "--only-show-errors", + ] + ) + if not isinstance(payload, list): + raise SystemExit("Azure CLI returned a non-list payload for Activity Log query.") + return payload + + +def parse_timestamp(value: str) -> datetime | None: + text = value.strip() + if text.endswith("Z"): + text = text[:-1] + "+00:00" + try: + return datetime.fromisoformat(text) + except ValueError: + return None + + +def build_timeline( + window: dict[str, Any], + *, + event_count: int, + max_events: int, + subscription_id: str, + command_timeline: dict[str, Any] | None = None, +) -> str: + lines = [ + "# Azure Activity Log Timeline", + "", + f"- run_id: `{window['run_id']}`", + f"- subscription_id: `{subscription_id}`", + f"- start_utc: `{window['start_utc']}`", + f"- end_utc: `{window['end_utc']}`", + f"- activity_log_event_count: `{event_count}`", + f"- activity_log_max_events: `{max_events}`", + "", + "## Phase Markers", + "", + ] + + marker_items: list[tuple[str, str]] = [] + for key, value in window.items(): + if key in {"run_id", "start_utc", "end_utc"}: + continue + if key.endswith("_utc") and isinstance(value, str): + marker_items.append((key.removesuffix("_utc"), value)) + marker_items.sort(key=lambda item: (parse_timestamp(item[1]) or datetime.max, item[0])) + + if marker_items: + for name, value in marker_items: + lines.append(f"- `{name}` at `{value}`") + else: + lines.append("- No additional phase markers were provided.") + + lines.extend( + [ + "", + "## AzureFox Command Markers", + "", + ] + ) + + if command_timeline: + command_runs = command_timeline.get("command_runs", []) + if command_runs: + for item in 
command_runs: + command = item.get("command", "unknown") + sequence = item.get("sequence") + started_at = item.get("started_at_utc", "unknown") + finished_at = item.get("finished_at_utc", "unknown") + duration = item.get("duration_seconds") + status = item.get("status", "unknown") + duration_text = ( + f"{duration:.3f}s" + if isinstance(duration, (int, float)) + else str(duration or "unknown") + ) + prefix = f"[{sequence:02d}] " if isinstance(sequence, int) else "" + lines.append( + f"- {prefix}`{command}` started `{started_at}`, finished `{finished_at}`, " + f"duration `{duration_text}`, status `{status}`" + ) + else: + lines.append("- Command timeline file was supplied, but it did not contain any command runs.") + else: + lines.append("- No AzureFox command timeline was supplied for this bundle.") + + lines.extend( + [ + "", + "## Analyst Note", + "", + "Correlate the phase markers and command markers above with `eventTimestamp`,", + "`operationName`, `resourceGroupName`, and `resourceId` fields in", + "`azure-activity-log.json`.", + "", + ] + ) + return "\n".join(lines) + + +def write_bundle( + *, + bundle_dir: Path, + window: dict[str, Any], + subscription_id: str, + activity_log: list[dict[str, Any]], + max_events: int, + command_timeline: dict[str, Any] | None, + zip_bundle: bool, +) -> None: + bundle_dir.mkdir(parents=True, exist_ok=True) + metadata = { + "bundle_generated_at_utc": utc_now().isoformat().replace("+00:00", "Z"), + "command_timeline_count": len((command_timeline or {}).get("command_runs", [])), + "command_timeline_included": command_timeline is not None, + "subscription_id": subscription_id, + "activity_log_event_count": len(activity_log), + "activity_log_max_events": max_events, + "source": "az monitor activity-log list", + } + (bundle_dir / "run-window.json").write_text( + json.dumps(window, indent=2, sort_keys=True) + "\n", + encoding="utf-8", + ) + (bundle_dir / "metadata.json").write_text( + json.dumps(metadata, indent=2, sort_keys=True) + 
"\n", + encoding="utf-8", + ) + (bundle_dir / "azure-activity-log.json").write_text( + json.dumps(activity_log, indent=2, sort_keys=True) + "\n", + encoding="utf-8", + ) + if command_timeline is not None: + (bundle_dir / "command-timeline.json").write_text( + json.dumps(command_timeline, indent=2, sort_keys=True) + "\n", + encoding="utf-8", + ) + (bundle_dir / "timeline.md").write_text( + build_timeline( + window, + event_count=len(activity_log), + max_events=max_events, + subscription_id=subscription_id, + command_timeline=command_timeline, + ) + + "\n", + encoding="utf-8", + ) + + if not zip_bundle: + return + + zip_path = bundle_dir.with_suffix(".zip") + with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive: + for file_path in sorted(bundle_dir.iterdir()): + archive.write(file_path, arcname=file_path.name) + + +def main() -> int: + args = parse_args() + window = normalize_window(args) + subscription_id = args.subscription or current_subscription_id() + bundle_dir = (args.output_root / window["run_id"]).resolve() + command_timeline = ( + load_command_timeline(args.command_timeline_file.resolve()) + if args.command_timeline_file is not None + else None + ) + activity_log = fetch_activity_log( + subscription_id=subscription_id, + start_time=window["start_utc"], + end_time=window["end_utc"], + max_events=args.max_events, + ) + write_bundle( + bundle_dir=bundle_dir, + window=window, + subscription_id=subscription_id, + activity_log=activity_log, + max_events=args.max_events, + command_timeline=command_timeline, + zip_bundle=not args.no_zip, + ) + print(f"Activity log bundle written to {bundle_dir}") + if not args.no_zip: + print(f"Zip archive written to {bundle_dir.with_suffix('.zip')}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/validate_azurefox_lab.py b/scripts/validate_azurefox_lab.py index 5c0e1b6..c1fb494 100755 --- a/scripts/validate_azurefox_lab.py +++ 
b/scripts/validate_azurefox_lab.py @@ -7,6 +7,7 @@ import subprocess import sys import time +from datetime import UTC, datetime from pathlib import Path from typing import Any @@ -14,6 +15,8 @@ COMMANDS = [ "whoami", "inventory", + "automation", + "devops", "arm-deployments", "env-vars", "tokens-credentials", @@ -21,12 +24,16 @@ "principals", "permissions", "privesc", + "role-trusts", + "lighthouse", + "cross-tenant", "resource-trusts", "auth-policies", "managed-identities", "keyvault", "storage", "vms", + "vmss", "nics", "dns", "endpoints", @@ -39,16 +46,6 @@ "acr", "databases", "snapshots-disks", - "role-trusts", -] - -ALL_CHECK_SECTION_ORDER = [ - "config", - "secrets", - "resource", - "network", - "compute", - "identity", ] AUTH_POLICY_FINDINGS = { @@ -58,7 +55,7 @@ "users-can-register-apps": "auth-policy-users-can-register-apps", } -RUN_MODE_CHOICES = ("full", "commands-only", "all-checks-only") +RUN_MODE_CHOICES = ("full", "commands-only") HEARTBEAT_INTERVAL_SECONDS = 30 SLOW_COMMAND_NOTES = { "role-trusts": ( @@ -104,8 +101,7 @@ def parse_args() -> argparse.Namespace: default="full", help=( "Validation scope to run: full executes the release-gated standalone command set; " - "commands-only is an explicit standalone-only rerun mode; " - "all-checks-only skips standalone commands and runs section wrappers only." + "commands-only is an explicit standalone-only rerun mode." 
), ) parser.add_argument( @@ -121,6 +117,10 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() +def utc_timestamp() -> str: + return datetime.now(UTC).isoformat(timespec="seconds").replace("+00:00", "Z") + + def run_json( cmd: list[str], cwd: Path, @@ -163,27 +163,10 @@ def log_progress(message: str) -> None: print(message, flush=True) -def ordered_all_checks_sections(sections: list[str]) -> list[str]: - preferred_positions = { - section: index for index, section in enumerate(ALL_CHECK_SECTION_ORDER) - } - return sorted( - sections, - key=lambda section: ( - preferred_positions.get(section, len(ALL_CHECK_SECTION_ORDER)), - section, - ), - ) - - def mode_runs_commands(mode: str) -> bool: return mode in {"full", "commands-only"} -def mode_runs_all_checks(mode: str) -> bool: - return mode == "all-checks-only" - - def selected_commands(skipped_commands: set[str]) -> list[str]: return [command for command in COMMANDS if command not in skipped_commands] @@ -205,6 +188,33 @@ def read_manifest(lab_dir: Path) -> dict[str, Any]: return value +def write_command_timeline( + artifacts_dir: Path, + *, + mode: str, + subscription_id: str, + commands: list[str], + skipped_commands: set[str], + started_at_utc: str, + command_runs: list[dict[str, Any]], + finished_at_utc: str | None = None, +) -> None: + payload = { + "mode": mode, + "subscription_id": subscription_id, + "validator_started_at_utc": started_at_utc, + "validator_finished_at_utc": finished_at_utc, + "requested_commands": commands, + "skipped_commands": sorted(skipped_commands), + "command_count": len(command_runs), + "command_runs": command_runs, + } + (artifacts_dir / "command-timeline.json").write_text( + json.dumps(payload, indent=2, sort_keys=True) + "\n", + encoding="utf-8", + ) + + def run_azurefox( azurefox_dir: Path, python_bin: str, @@ -212,15 +222,24 @@ def run_azurefox( artifacts_dir: Path, mode: str, commands: list[str], - all_checks_sections: list[str], -) -> tuple[dict[str, Any], 
dict[str, Path], dict[str, Any], dict[str, Path]]: + skipped_commands: set[str], +) -> tuple[dict[str, Any], dict[str, Path]]: outputs: dict[str, Any] = {} loot_paths: dict[str, Path] = {} - run_summaries: dict[str, Any] = {} - run_summary_paths: dict[str, Path] = {} env = os.environ.copy() pythonpath = str(azurefox_dir / "src") env["PYTHONPATH"] = f"{pythonpath}{os.pathsep}{env['PYTHONPATH']}" if env.get("PYTHONPATH") else pythonpath + validator_started_at_utc = utc_timestamp() + command_runs: list[dict[str, Any]] = [] + write_command_timeline( + artifacts_dir, + mode=mode, + subscription_id=subscription_id, + commands=commands, + skipped_commands=skipped_commands, + started_at_utc=validator_started_at_utc, + command_runs=command_runs, + ) if mode_runs_commands(mode): loot_root = artifacts_dir / "loot" @@ -228,6 +247,7 @@ def run_azurefox( command_total = len(commands) for index, command in enumerate(commands, start=1): step_started = time.monotonic() + command_started_at_utc = utc_timestamp() outdir = artifacts_dir / command outdir.mkdir(parents=True, exist_ok=True) log_progress( @@ -236,23 +256,47 @@ def run_azurefox( slow_note = SLOW_COMMAND_NOTES.get(command) if slow_note: log_progress(f"[note {index}/{command_total}] azurefox {command}: {slow_note}") - payload = run_json( - [ - python_bin, - "-m", - "azurefox", - "--subscription", - subscription_id, - "--output", - "json", - "--outdir", - str(outdir), - command, - ], - cwd=azurefox_dir, - env=env, - progress_label=f"azurefox {command}", - ) + try: + payload = run_json( + [ + python_bin, + "-m", + "azurefox", + "--subscription", + subscription_id, + "--output", + "json", + "--outdir", + str(outdir), + command, + ], + cwd=azurefox_dir, + env=env, + progress_label=f"azurefox {command}", + ) + except Exception as exc: + command_runs.append( + { + "artifacts_dir": str(outdir), + "command": command, + "duration_seconds": round(time.monotonic() - step_started, 3), + "error": str(exc), + "finished_at_utc": 
utc_timestamp(), + "sequence": index, + "started_at_utc": command_started_at_utc, + "status": "failed", + } + ) + write_command_timeline( + artifacts_dir, + mode=mode, + subscription_id=subscription_id, + commands=commands, + skipped_commands=skipped_commands, + started_at_utc=validator_started_at_utc, + command_runs=command_runs, + ) + raise outputs[command] = payload (artifacts_dir / f"{command}.json").write_text( json.dumps(payload, indent=2, sort_keys=True) + "\n", @@ -264,57 +308,44 @@ def run_azurefox( target = loot_root / f"{command}.json" target.write_text(emitted_loot.read_text(encoding="utf-8"), encoding="utf-8") loot_paths[command] = target - log_progress( - f"[done {index}/{command_total}] azurefox {command} " - f"({time.monotonic() - step_started:.1f}s)" + command_runs.append( + { + "artifacts_dir": str(outdir), + "command": command, + "duration_seconds": round(time.monotonic() - step_started, 3), + "finished_at_utc": utc_timestamp(), + "loot_path": str(target), + "payload_path": str(artifacts_dir / f"{command}.json"), + "sequence": index, + "started_at_utc": command_started_at_utc, + "status": "succeeded", + } ) - - if mode_runs_all_checks(mode): - section_total = len(all_checks_sections) - for index, section in enumerate(all_checks_sections, start=1): - step_started = time.monotonic() - checkpoint_dir = artifacts_dir / f"{section}-checkpoint" - checkpoint_dir.mkdir(parents=True, exist_ok=True) - log_progress( - f"[run {index}/{section_total}] azurefox all-checks --section {section} " - f"-> {checkpoint_dir}" - ) - run_summary = run_json( - [ - python_bin, - "-m", - "azurefox", - "--subscription", - subscription_id, - "--output", - "json", - "--outdir", - str(checkpoint_dir), - "all-checks", - "--section", - section, - ], - cwd=azurefox_dir, - env=env, - progress_label=f"azurefox all-checks --section {section}", - ) - run_summary_path = checkpoint_dir / "run-summary.json" - if not run_summary_path.exists(): - raise AssertionError( - f"AzureFox did not 
emit {section}-checkpoint/run-summary.json" - ) - (artifacts_dir / f"all-checks-{section}.json").write_text( - json.dumps(run_summary, indent=2, sort_keys=True) + "\n", - encoding="utf-8", + write_command_timeline( + artifacts_dir, + mode=mode, + subscription_id=subscription_id, + commands=commands, + skipped_commands=skipped_commands, + started_at_utc=validator_started_at_utc, + command_runs=command_runs, ) - run_summaries[section] = run_summary - run_summary_paths[section] = run_summary_path log_progress( - f"[done {index}/{section_total}] azurefox all-checks --section {section} " + f"[done {index}/{command_total}] azurefox {command} " f"({time.monotonic() - step_started:.1f}s)" ) - return outputs, loot_paths, run_summaries, run_summary_paths + write_command_timeline( + artifacts_dir, + mode=mode, + subscription_id=subscription_id, + commands=commands, + skipped_commands=skipped_commands, + started_at_utc=validator_started_at_utc, + command_runs=command_runs, + finished_at_utc=utc_timestamp(), + ) + return outputs, loot_paths def assert_true(condition: bool, message: str) -> None: @@ -377,6 +408,13 @@ def find_vm(payload: dict[str, Any], vm_name: str) -> dict[str, Any]: raise AssertionError(f"VM asset '{vm_name}' not found in vms output") +def find_vmss_asset(payload: dict[str, Any], vmss_name: str) -> dict[str, Any]: + for asset in payload.get("vmss_assets", []): + if asset.get("name") == vmss_name: + return asset + raise AssertionError(f"vmss output missing asset '{vmss_name}'") + + def find_nic(payload: dict[str, Any], nic_name: str) -> dict[str, Any]: for asset in payload.get("nic_assets", []): if asset.get("name") == nic_name: @@ -433,6 +471,13 @@ def find_app_service(payload: dict[str, Any], name: str) -> dict[str, Any]: raise AssertionError(f"app-services output missing asset '{name}'") +def find_automation_account(payload: dict[str, Any], name: str) -> dict[str, Any]: + for asset in payload.get("automation_accounts", []): + if asset.get("name") == name: + 
return asset + raise AssertionError(f"automation output missing account '{name}'") + + def find_function_app(payload: dict[str, Any], name: str) -> dict[str, Any]: for asset in payload.get("function_apps", []): if asset.get("name") == name: @@ -578,8 +623,6 @@ def validate_outputs( loot_paths: dict[str, Path], executed_commands: list[str], skipped_commands: set[str], - run_summaries: dict[str, Any], - run_summary_paths: dict[str, Path], ) -> tuple[list[str], list[str], list[str]]: checks: list[str] = [] mismatches: list[str] = [] @@ -744,6 +787,33 @@ def validate_outputs( elif "role-trusts" in skipped_commands: checks.append("role-trusts was intentionally skipped on this rerun after an earlier baseline validation") + lighthouse = outputs["lighthouse"] + assert_true( + isinstance(lighthouse.get("lighthouse_delegations", []), list), + "lighthouse did not return a lighthouse_delegations list", + ) + assert_true( + not lighthouse.get("issues"), + "lighthouse reported unexpected collection issues", + ) + checks.append("lighthouse completed cleanly and kept delegated-management evidence explicit without requiring the lab to invent a second tenant") + + cross_tenant = outputs["cross-tenant"] + assert_true( + isinstance(cross_tenant.get("cross_tenant_paths", []), list), + "cross-tenant did not return a cross_tenant_paths list", + ) + unexpected_cross_tenant_issues = [ + issue + for issue in cross_tenant.get("issues", []) + if (issue.get("context") or {}).get("collector") != "auth_policies.security_defaults" + ] + assert_true( + not unexpected_cross_tenant_issues, + "cross-tenant reported unexpected issues outside the known Graph partial-read boundary", + ) + checks.append("cross-tenant completed and kept outside-tenant evidence tenant-shaped instead of pretending it was a deterministic lab census") + auth_policies = outputs["auth-policies"] assert_true( manifest["auth_policies"]["validation_mode"] == "non-invasive", @@ -790,9 +860,89 @@ def validate_outputs( 
follow_ups.append( "Keep auth-policies wording evidence-based when security defaults or Conditional Access " "surfaces are partially unreadable; partial visibility should remain explicit." - ) + ) checks.append("auth-policies stayed in metadata-validation mode and handled partial Graph visibility explicitly") + automation = outputs["automation"] + expected_automation = phase4_manifest.get("automation", {}).get("ops") + if expected_automation: + automation_account = find_automation_account(automation, expected_automation["name"]) + expected_identity = expected_automation["identity_type"] + visible_identity = automation_account.get("identity_type") + if expected_identity is None: + assert_true( + visible_identity is None, + "automation account identity type mismatch", + ) + elif visible_identity != expected_identity: + mismatches.append( + "automation did not return the expected managed-identity type for the lab-owned " + f"Automation account; expected {expected_identity!r}, got {visible_identity!r}." + ) + follow_ups.append( + "Keep automation identity wording evidence-based until AzureFox reliably surfaces " + "managed-identity type for Automation accounts in the live read path." 
+ ) + for field_name in ( + "runbook_count", + "schedule_count", + "job_schedule_count", + "webhook_count", + "hybrid_worker_group_count", + "credential_count", + "certificate_count", + "connection_count", + "variable_count", + "encrypted_variable_count", + ): + assert_true( + automation_account.get(field_name) == expected_automation[field_name], + f"automation account {field_name} mismatch", + ) + if visible_identity: + checks.append( + "automation surfaced the lab-owned Automation account with the expected visible identity and zero-object execution posture" + ) + else: + checks.append( + "automation surfaced the lab-owned Automation account and matched the current visible zero-object execution posture even though the current read path did not return an identity type" + ) + + devops = outputs["devops"] + devops_config_issue = next( + ( + issue + for issue in devops.get("issues", []) + if (issue.get("context") or {}).get("collector") == "devops" + ), + None, + ) + devops_organization = (devops.get("metadata") or {}).get("devops_organization") + if devops_organization: + assert_true( + devops_config_issue is None, + "devops unexpectedly reported an organization-configuration issue despite metadata.devops_organization", + ) + assert_true( + isinstance(devops.get("pipelines", []), list), + "devops did not return a pipelines list", + ) + checks.append( + "devops used the configured Azure DevOps organization and returned pipeline evidence without a configuration error" + ) + else: + assert_true( + devops_config_issue is not None, + "devops did not record the expected Azure DevOps organization configuration issue", + ) + assert_true( + "not configured" in str(devops_config_issue.get("message", "")).lower(), + "devops missing-organization issue message drifted unexpectedly", + ) + checks.append( + "devops stayed truthful about the missing Azure DevOps organization instead of pretending pipeline coverage existed" + ) + managed_identities = outputs["managed-identities"] 
identity = find_identity(managed_identities, identity_name) assert_true( @@ -919,6 +1069,35 @@ def validate_outputs( ) checks.append("vms reported the public VM and attached identity without overstating VMSS coverage") + vmss = outputs["vmss"] + expected_vmss = phase3_manifest["vmss"]["api"] + vmss_asset = find_vmss_asset(vmss, expected_vmss["name"]) + assert_true( + vmss_asset.get("sku_name") == expected_vmss["sku_name"], + "vmss SKU mismatch", + ) + assert_true( + vmss_asset.get("instance_count") == expected_vmss["instance_count"], + "vmss instance count mismatch", + ) + assert_true( + vmss_asset.get("identity_type") == expected_vmss["identity_type"], + "vmss identity type mismatch", + ) + assert_true( + vmss_asset.get("nic_configuration_count") == expected_vmss["nic_configuration_count"], + "vmss NIC configuration count mismatch", + ) + assert_true( + vmss_asset.get("public_ip_configuration_count") == expected_vmss["public_ip_configuration_count"], + "vmss public IP configuration count mismatch", + ) + assert_true( + expected_vmss["subnet_id"] in set(vmss_asset.get("subnet_ids", [])), + "vmss subnet reference mismatch", + ) + checks.append("vmss surfaced the internal scale-set footprint and network placement without inventing public-frontend exposure") + nics = outputs["nics"] vm_primary_nic = phase3_manifest["nics"]["vm_primary"] nic_asset = find_nic(nics, vm_primary_nic["name"]) @@ -1521,31 +1700,6 @@ def validate_outputs( assert_true(loot_paths.get(command, Path()).exists(), f"{command} loot artifact missing") checks.append("all single-command runs returned JSON payloads and emitted loot artifacts") - if mode_runs_all_checks(mode): - for section, expected_commands in manifest["all_checks_sections"].items(): - run_summary = run_summaries[section] - run_summary_path = run_summary_paths[section] - assert_true(run_summary["metadata"]["command"] == "all-checks", f"{section} run-summary command mismatch") - assert_true(run_summary.get("section") == section, 
f"{section} run-summary section mismatch") - result_map = {item.get("command"): item for item in run_summary.get("results", [])} - assert_true( - set(expected_commands).issubset(result_map), - f"{section} run-summary missing expected commands", - ) - for command in expected_commands: - result = result_map[command] - assert_true(result.get("status") == "ok", f"{section} run-summary reported non-ok status for {command}") - artifact_paths = result.get("artifact_paths") or {} - for label, path in artifact_paths.items(): - assert_true( - path and Path(path).exists(), - f"{section} run-summary missing {label} artifact for {command}", - ) - assert_true(run_summary_path.exists(), f"{section} run-summary.json path is missing on disk") - checks.append( - "all-checks emitted complete artifact sets for identity, network, compute, config, secrets, and resource sections" - ) - return checks, mismatches, follow_ups @@ -1555,16 +1709,12 @@ def write_summary( checks: list[str], mismatches: list[str], follow_ups: list[str], - run_summary_paths: dict[str, Path], ) -> None: summary = { "checks": checks, "follow_ups": follow_ups, "mismatches": mismatches, "mode": mode, - "run_summary_paths": { - section: str(path) for section, path in sorted(run_summary_paths.items()) - }, "status": "pass", } (artifacts_dir / "summary.json").write_text( @@ -1642,18 +1792,15 @@ def main() -> int: if skipped_commands: log_progress(f"[info] skipped standalone commands: {', '.join(sorted(skipped_commands))}") manifest = read_manifest(lab_dir) - all_checks_sections = ordered_all_checks_sections( - list(manifest["all_checks_sections"].keys()) - ) commands = selected_commands(skipped_commands) - outputs, loot_paths, run_summaries, run_summary_paths = run_azurefox( + outputs, loot_paths = run_azurefox( azurefox_dir=azurefox_dir, python_bin=args.python, subscription_id=manifest["subscription_id"], artifacts_dir=artifacts_dir, mode=args.mode, commands=commands, - all_checks_sections=all_checks_sections, + 
skipped_commands=skipped_commands, ) checks, mismatches, follow_ups = validate_outputs( manifest, @@ -1662,10 +1809,8 @@ def main() -> int: loot_paths, commands, skipped_commands, - run_summaries, - run_summary_paths, ) - write_summary(artifacts_dir, args.mode, checks, mismatches, follow_ups, run_summary_paths) + write_summary(artifacts_dir, args.mode, checks, mismatches, follow_ups) print(f"Validation complete. Artifacts written to {artifacts_dir}") return 0