diff --git a/.github/workflows/uat-daytime.yaml b/.github/workflows/uat-daytime.yaml new file mode 100644 index 000000000..df0a325d1 --- /dev/null +++ b/.github/workflows/uat-daytime.yaml @@ -0,0 +1,256 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: UAT Daytime Human-Access + +# The day side of the day/night cycle (#1281, DC8): stand up ONE long-lived, +# human-facing deployment per cloud for the working day, then tear it down +# before the nightly batch. The cloud→flavor split is data, not code — it comes +# from the `daytime-intent` column of the reservation registry +# (infra/uat/reservations.yaml), defaulting to AWS=training / GCP=inference. +# +# This workflow is a THIN SCHEDULER. It owns no lifecycle mechanics: it enumerates +# the daytime rotation from the broker and dispatches the shared uat-run.yaml +# once per reservation with lifecycle=daytime-up (morning) or daytime-down +# (evening). DC2 owns provision-and-hold, teardown, and the pre-batch guard; +# uat-run.yaml owns the per-reservation lease. Because the daytime runs go through +# that same dispatch surface, they contend on the SAME lease as the nightly batch +# — CI and human use never overlap on one reservation. +# +# The daytime cluster is NOT a UAT cell: daytime-up stops after deploy (no CUJ), +# so it emits NO evidence bundle and produces NO TestGrid column. 
Access is +# distributed OUT-OF-BAND (see docs/contributor/uat.md), never through this CI +# path. +on: + schedule: + # Times are UTC. The nightly batch opens at 04:00 UTC (uat-nightly-batch.yaml). + # KEEP IN LOCKSTEP with env.MORNING_UP_CRON / env.EVENING_DOWN_CRON below — + # the Resolve-action step maps `github.event.schedule` to up/down by an exact + # string match against those env values (the `on:` block cannot reference + # env, so this coupling is by convention). Editing one cron without the other + # breaks that edge with `::error::could not resolve daytime action` when it fires. + - cron: '0 15 * * *' # morning handoff — provision + deploy + HOLD (daytime-up) + - cron: '0 2 * * *' # evening teardown — release the reservation ~2h before the batch (daytime-down) + workflow_dispatch: + inputs: + action: + description: 'up: provision + deploy + hold the daytime cluster. down: tear it down and release the reservation.' + type: choice + options: [up, down] + required: true + +permissions: + contents: read + +# Serialize daytime scheduler runs so a manual dispatch does not fan out a +# second up/down alongside a cron one. (uat-run.yaml's per-reservation lease +# would queue any duplicates anyway; there is no reason to start them.) +concurrency: + group: uat-daytime + cancel-in-progress: false + +# Cron edges map to lifecycle actions; a manual dispatch names the action +# directly. Consumed by the enumerate job's Resolve action step, which maps +# `github.event.schedule` to up/down by an EXACT string match against these. +# These two values MUST stay identical to the `on.schedule` crons above — an +# unsynced edit breaks the drifted edge (fails loud, but only when it fires). 
+env: + MORNING_UP_CRON: '0 15 * * *' # must equal the daytime-up cron in on.schedule + EVENING_DOWN_CRON: '0 2 * * *' # must equal the daytime-down cron in on.schedule + +jobs: + # Resolve the up/down action for this trigger and enumerate the daytime + # rotation (reservation + intent) from the registry into a JSON matrix. Keeps + # the scheduler data-driven: onboarding a daytime reservation is a + # `daytime-intent:` edit in infra/uat/reservations.yaml, no workflow change. + enumerate: + if: github.repository == 'nvidia/aicr' + runs-on: ubuntu-latest + outputs: + action: ${{ steps.action.outputs.action }} + lifecycle: ${{ steps.action.outputs.lifecycle }} + matrix: ${{ steps.list.outputs.matrix }} + steps: + - name: Checkout + uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 + with: + persist-credentials: false + - name: Resolve action + id: action + env: + # github.event.schedule is the cron string that fired (empty on + # workflow_dispatch); inputs.action is set only on workflow_dispatch. 
+ EVENT_NAME: ${{ github.event_name }} + SCHEDULE: ${{ github.event.schedule }} + INPUT_ACTION: ${{ inputs.action }} + run: | + set -euo pipefail + if [[ "${EVENT_NAME}" == "workflow_dispatch" ]]; then + action="${INPUT_ACTION}" + elif [[ "${SCHEDULE}" == "${MORNING_UP_CRON}" ]]; then + action="up" + elif [[ "${SCHEDULE}" == "${EVENING_DOWN_CRON}" ]]; then + action="down" + else + echo "::error::could not resolve daytime action (event='${EVENT_NAME}', schedule='${SCHEDULE}')" + exit 1 + fi + case "${action}" in + up|down) ;; + *) echo "::error::invalid action '${action}' (want up|down)"; exit 1 ;; + esac + echo "action=${action}" >> "$GITHUB_OUTPUT" + echo "lifecycle=daytime-${action}" >> "$GITHUB_OUTPUT" + echo "Daytime action: ${action} (lifecycle=daytime-${action})" + - name: Load versions + id: versions + uses: ./.github/actions/load-versions + - name: Setup Go + uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0 + with: + go-version: '${{ steps.versions.outputs.go }}' + cache: true + cache-dependency-path: | + go.sum + vendor/modules.txt + - name: Enumerate daytime rotation + id: list + env: + GOFLAGS: -mod=vendor + ACTION: ${{ steps.action.outputs.action }} + run: | + set -euo pipefail + go build -o ./bin/uat-broker ./tools/uat-broker + # Compact to one line for GITHUB_OUTPUT; the matrix is a JSON array of + # {reservation,intent} objects consumed as strategy.matrix.include. + matrix=$(./bin/uat-broker reservations --daytime | jq -c .) + echo "matrix=${matrix}" >> "$GITHUB_OUTPUT" + echo "Daytime rotation: ${matrix}" + if [[ "${matrix}" == "[]" ]]; then + echo "::notice::no reservations carry a daytime-intent; nothing to ${ACTION}." + fi + + # One matrix leg per daytime reservation (parallel — independent hardware). 
+ # Each leg DISPATCHES uat-run.yaml once (workflow_dispatch is exempt from the + # GITHUB_TOKEN recursion rule and always creates a run) and watches it to + # completion so a failed morning handoff / evening teardown surfaces here. The + # dispatched uat-run.yaml runs top-level with its own permissions and holds the + # reservation lease; this job only needs actions:write to dispatch + watch. + drive: + needs: enumerate + if: needs.enumerate.outputs.matrix != '[]' + runs-on: ubuntu-latest + # A daytime-up dispatch (provision + deploy) can run ~90 min; watching it + # blocks this job for that long. Stay under GitHub's 6h hosted-job cap. + timeout-minutes: 200 + strategy: + fail-fast: false + matrix: + include: ${{ fromJSON(needs.enumerate.outputs.matrix) }} + permissions: + contents: read # checkout + actions: write # dispatch uat-run.yaml and watch the dispatched run + steps: + - name: Checkout + uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 + with: + persist-credentials: false + - name: Dispatch and watch the daytime run + env: + GH_TOKEN: ${{ github.token }} + REPO: ${{ github.repository }} + REF: ${{ github.ref_name }} + RESERVATION: ${{ matrix.reservation }} + INTENT: ${{ matrix.intent }} + ACTION: ${{ needs.enumerate.outputs.action }} + LIFECYCLE: ${{ needs.enumerate.outputs.lifecycle }} + run: | + set -euo pipefail + # Poll budget for finding the dispatched run (gh workflow run returns + # no run id): POLL_MAX_ATTEMPTS x POLL_INTERVAL_SECONDS seconds. + POLL_MAX_ATTEMPTS=24 + POLL_INTERVAL_SECONDS=5 + + # Unique per-dispatch key (this run's id + attempt + reservation + + # action) so the resolver below watches THIS run, never a concurrent + # manual dispatch of the same reservation. Mirrors the nightly + # controller's correlation scheme. 
+ dispatch_key="${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${RESERVATION}-${ACTION}" + # aicr_version is left empty (daytime always runs main/tip), so + # uat-run.yaml's run-name renders '@ main'; the dispatch_key suffix + # makes the title globally unique. + title="UAT ${RESERVATION} @ main #${dispatch_key}" + echo "::group::${title}" + + gh workflow run uat-run.yaml --repo "$REPO" --ref "$REF" \ + -f reservation="$RESERVATION" \ + -f intent="$INTENT" \ + -f lifecycle="$LIFECYCLE" \ + -f dispatch_key="$dispatch_key" + + # Resolve the dispatched run by its (globally unique) run-name; no + # createdAt filter, so a runner clock ahead of GitHub's API clock + # cannot falsely reject this dispatch. + run_id="" + for _ in $(seq 1 "$POLL_MAX_ATTEMPTS"); do + sleep "$POLL_INTERVAL_SECONDS" + run_id=$(gh run list --repo "$REPO" --workflow uat-run.yaml \ + --event workflow_dispatch --limit 50 \ + --json databaseId,displayTitle,createdAt 2>/dev/null \ + | jq -r --arg t "$title" \ + 'map(select(.displayTitle == $t)) + | sort_by(.createdAt) | .[-1].databaseId // empty' || true) + [[ -n "$run_id" ]] && break + done + if [[ -z "$run_id" ]]; then + echo "::error::Dispatched ${title} but could not resolve its run id." + echo "::endgroup::" + exit 1 + fi + + echo "Waiting on run ${run_id} ..." + # --exit-status is intentionally not fatal here; classify the run's + # conclusion below so a benign supersede is not mistaken for a failure. + gh run watch "$run_id" --repo "$REPO" --exit-status || true + + conclusion=$(gh run view "$run_id" --repo "$REPO" --json conclusion --jq '.conclusion' 2>/dev/null || echo "") + njobs=$(gh api "repos/${REPO}/actions/runs/${run_id}/jobs" --jq '.total_count' 2>/dev/null || echo "") + echo "::endgroup::" + if [[ "$conclusion" == "cancelled" && "$njobs" == "0" ]]; then + # Benign: the single-slot reservation lease dropped this run while it + # was still pending (the nightly batch or another daytime run held + # the reservation). 
Surface it — must not be silent — but do not fail: + # re-dispatch when the reservation frees. + echo "::warning::${title} was superseded while pending (dropped by the reservation lease); re-run when the reservation frees." + elif [[ "$conclusion" != "success" ]]; then + echo "::error::daytime ${ACTION} for ${RESERVATION} finished with conclusion '${conclusion:-unknown}'." + exit 1 + fi + echo "daytime ${ACTION} for ${RESERVATION} (${INTENT}): ${conclusion}" + + - name: Summary + if: always() + env: + SUMMARY_RESERVATION: ${{ matrix.reservation }} + SUMMARY_INTENT: ${{ matrix.intent }} + SUMMARY_ACTION: ${{ needs.enumerate.outputs.action }} + run: | + { + echo "## Daytime human-access — ${SUMMARY_ACTION}" + echo "" + printf '**Reservation:** `%s` · **Intent:** `%s` · **Action:** `%s`\n' \ + "$SUMMARY_RESERVATION" "$SUMMARY_INTENT" "$SUMMARY_ACTION" + echo "" + echo "Access is distributed out-of-band; this cluster emits no evidence bundle and no TestGrid column." + } >> "$GITHUB_STEP_SUMMARY" diff --git a/docs/contributor/uat.md b/docs/contributor/uat.md index e5f535848..579ed1b46 100644 --- a/docs/contributor/uat.md +++ b/docs/contributor/uat.md @@ -7,13 +7,13 @@ Each reserved GPU pool follows a daily cycle, with every phase acquiring the *same* per-reservation lease so CI and human use never overlap on one reservation: - **Night — the nightly batch.** On a cron, `uat-nightly-batch.yaml` runs the [version matrix](#the-version-matrix) per reservation — `main` plus the previous N stable releases — each cell a full provision → CUJ → evidence → publish → teardown. This is the `lifecycle=nightly` mode: provision-and-destroy under a run-scoped cluster name. -- **Morning — handoff.** Once the batch drains a reservation, the daytime human-access deployment is stood up on it with `lifecycle=daytime-up`: provision, deploy the stack, and **hold** (no teardown) under a stable, reservation-tagged cluster name. 
DC2 owns this provision-and-hold mechanic; DC8 owns *what* is deployed and how access is shared. -- **Day — human use.** The daytime cluster is used outside CI. -- **Evening — teardown.** The daytime cluster is torn down with `lifecycle=daytime-down` **before** the next night batch, releasing the reservation. +- **Morning — handoff.** Once the batch drains a reservation, the [daytime human-access deployment](#daytime-human-access-deployment) is stood up on it with `lifecycle=daytime-up`: provision, deploy the stack, and **hold** (no teardown) under a stable, reservation-tagged cluster name. The `uat-daytime.yaml` scheduler fires this on a morning cron for every reservation in the daytime rotation. DC2 owns the provision-and-hold mechanic; DC8 (`uat-daytime.yaml`) owns *which* flavor lands on *which* cloud and how access is shared. +- **Day — human use.** The daytime cluster is used outside CI — humans reach it [out-of-band](#daytime-human-access-deployment), never through the CI path. +- **Evening — teardown.** `uat-daytime.yaml` fires `lifecycle=daytime-down` on an evening cron to tear the daytime cluster down and release the reservation **before** the next night batch. The phases are independently scheduled (cron edges), not chained: the per-reservation lease — plus a [pre-batch guard](#pre-batch-guard) — keeps them from overlapping, so a crashed or overrunning phase never orphans the reservation. A hosted GitHub Actions job is capped at the runner's timeout (hours, not a whole working day), so a single lease-holding run cannot span the day; the lease only needs to cover the brief transition windows, and the steady-state daytime cluster's existence is tracked by its stable, reservation-tagged name rather than a continuously held run. -> What ships today is the **night side** (the nightly batch), the **lease + dispatch surface** every phase builds on, and DC2's **per-intent selection**, **daytime provision-and-hold / teardown mechanics**, and **pre-batch guard**. 
The daytime deployment's *workload content* and out-of-band access distribution are owned by DC8 and layer on top of the `daytime-up` mechanic. +> What ships today is the **night side** (the nightly batch), the **lease + dispatch surface** every phase builds on, DC2's **per-intent selection**, **daytime provision-and-hold / teardown mechanics**, and **pre-batch guard**, and DC8's **day side** — the `uat-daytime.yaml` scheduler that stands up one human-facing deployment per cloud each morning and tears it down each evening. The served-inference *workload* an `intent=inference` cluster runs (`phase_serve`) is still owned by DC3. ## Requesting a UAT run @@ -63,6 +63,71 @@ The `lifecycle` input selects one of three cluster lifecycles, all sharing the r The nightly per-run name isolates concurrent history (OCI tags, Terraform state) per run. The daytime name is **stable and reservation-tagged** so the evening `daytime-down` teardown and the nightly pre-batch guard can find the held cluster without tracking a run id. `skip_delete` is a nightly-only debugging escape and is ignored by the daytime lifecycles. +## Daytime human-access deployment + +The **day side** of the cycle (issue #1281, DC8) stands up **one long-lived, human-facing deployment per cloud** for the working day — a place to submit jobs, hit a served endpoint, and demo, **outside CI**. It is *not* a UAT cell: it emits **no evidence bundle** and produces **no TestGrid column**, and access is distributed [out-of-band](#reaching-the-daytime-cluster). The scarce reservation time is split between this human use and the nightly [version matrix](#the-version-matrix); the two never overlap on one reservation because both route through the same lease. + +### The cloud→flavor split + +Which cloud hosts which flavor is **data, not code**: the `daytime-intent` column of each row in `infra/uat/reservations.yaml`. 
A row with `daytime-intent: training` or `daytime-intent: inference` joins the daytime rotation; an empty/absent value keeps the reservation nightly-batch-only. The launch default splits the two flavors across the two clouds: + +| Reservation | Cloud | `daytime-intent` | Daytime deployment | +|-------------|-------|------------------|--------------------| +| `aws-h100` | AWS | `training` | training stack (Kubeflow `TrainJob`s) | +| `gcp-h100` | GCP | `inference` | inference stack (Dynamo, OpenAI-compatible endpoint) | + +Re-splitting (or adding a daytime reservation) is a registry edit — no workflow change. Only **one** reservation per cloud may carry a `daytime-intent` today: a single reservation cannot host both a held daytime cluster and the nightly batch at once, so *both* flavors on one cloud during the day is out of scope until more capacity lands. The `uatbroker` registry validation (`Registry.Validate`) rejects a second daytime reservation on the same cloud for every registry it loads, and the committed-registry test additionally pins the launch split. + +### The scheduler (`uat-daytime.yaml`) + +`uat-daytime.yaml` is a thin scheduler over the `daytime-up` / `daytime-down` mechanics — it owns no lifecycle logic. It enumerates the rotation (`uat-broker reservations --daytime` → a JSON `{reservation, intent}` matrix) and, once per reservation, dispatches the shared `uat-run.yaml` with the reservation's intent and the edge's lifecycle, then watches the dispatched run to completion so a failed handoff/teardown surfaces on the scheduler run. Because it goes through `uat-run.yaml`, each daytime run takes the **same per-reservation lease** as the nightly batch. 
+ +Two cron edges (UTC), plus a manual `workflow_dispatch` with an `action: up | down` input: + +| Edge | Cron (UTC) | Action | Lifecycle dispatched | +|------|-----------|--------|----------------------| +| Morning handoff | `0 15 * * *` | `up` | `daytime-up` (provision + deploy + hold) | +| Evening teardown | `0 2 * * *` | `down` | `daytime-down` (tear down + release) | + +The evening teardown runs ~2h before the nightly batch opens (`0 4 * * *`), leaving margin for a ~10–15 min destroy. A manual run to stand up or tear down the whole rotation by hand: + +```bash +gh workflow run uat-daytime.yaml --repo NVIDIA/aicr --ref main -f action=up # morning handoff +gh workflow run uat-daytime.yaml --repo NVIDIA/aicr --ref main -f action=down # evening teardown +``` + +Different reservations run in parallel (independent hardware); a daytime run that finds its reservation still busy (an overrunning batch) *queues* on the lease rather than racing. + +### If the evening teardown is missed + +The teardown is not the only safety net. If a `daytime-down` is skipped or fails and the daytime cluster is still up when the nightly batch opens, DC2's [pre-batch guard](#pre-batch-guard) **blocks** the batch (fail-closed) rather than racing the held deployment. Recover by tearing the daytime cluster down — `gh workflow run uat-daytime.yaml -f action=down`, or a single `uat-run.yaml … -f lifecycle=daytime-down` for one reservation — then re-run the batch. + +### Reaching the daytime cluster + +Access is **out-of-band by design**: nothing here routes a kubeconfig or endpoint URL through the CI path, the evidence bundle, or the dashboard. 
Instead, the cluster's stable name is public knowledge and access is gated by **cloud IAM** on the daytime cluster — so an authorized operator mints their own kubeconfig directly and no credential ever transits CI: + +```bash +# AWS — training cluster (aicr-uat-day-aws-h100) +aws eks update-kubeconfig --region us-east-1 --name aicr-uat-day-aws-h100 + +# GCP — inference cluster (aicr-day-gcp-h100) +gcloud container clusters get-credentials aicr-day-gcp-h100 --region <region> +``` + +**Training (AWS).** Submit Kubeflow `TrainJob`s against the held cluster — the same CUJ the nightly `intent=training` run exercises (see `demos/cuj1-training.md`). + +**Inference (GCP).** The `daytime-up` run deploys the Dynamo inference *platform* (dynamo-platform + KAI scheduler + DRA driver). Apply a `DynamoGraphDeployment` served workload — reuse an existing serve asset such as `demos/workloads/inference/vllm-agg.yaml`; DC8 does **not** invent a serving stack — then reach its OpenAI-compatible endpoint by port-forwarding the frontend: + +```bash +kubectl port-forward -n dynamo-system svc/vllm-agg-frontend 8000:8000 & +curl http://localhost:8000/v1/models +curl http://localhost:8000/v1/chat/completions \ + -H 'Content-Type: application/json' \ + -d '{"model":"<model>","messages":[{"role":"user","content":"hello"}]}' +``` + +Once DC3's `phase_serve` lands, the served workload is deployed automatically as part of `daytime-up`; until then it is a one-command manual apply on the held cluster. + ## Pre-batch guard A missed evening teardown must surface as a **blocked batch, never as silent contention** with the still-running daytime deployment. Before it provisions, every `nightly` run asserts that no daytime cluster (by the stable `aicr-uat-day-<reservation>` / `aicr-day-<reservation>` name) is still up on the target reservation. The check runs *after* the run has acquired the reservation lease and authenticated to the cloud, and *before* Bringup — so it fails fast rather than racing. 
It fails **closed**: only a definitive "cluster does not exist" (AWS `ResourceNotFoundException`, GCP `code=404`) clears the run to proceed; a throttle or auth error blocks the batch rather than being read as "clear." @@ -118,7 +183,7 @@ The values in this file are identifiers, **not secrets** — a reservation-id gr ## Roadmap -What ships now is the lease, the data-driven dispatch surface, the time-boxed nightly version matrix (`main` + previous-N stable releases, release cells installing released artifacts), superseded-run surfacing (the controller flags a dropped cell inline; the `uat-superseded-notice.yaml` observer catches ad-hoc dropped runs), per-intent selection, and the daytime provision-and-hold / teardown / pre-batch-guard mechanics. Still to come: +What ships now is the lease, the data-driven dispatch surface, the time-boxed nightly version matrix (`main` + previous-N stable releases, release cells installing released artifacts), superseded-run surfacing (the controller flags a dropped cell inline; the `uat-superseded-notice.yaml` observer catches ad-hoc dropped runs), per-intent selection, the daytime provision-and-hold / teardown / pre-batch-guard mechanics, and the DC8 [daytime human-access scheduler](#daytime-human-access-deployment) (`uat-daytime.yaml`) — one held deployment per cloud each working day, torn down before the batch, with out-of-band access. Still to come: -- **Served-inference CUJ (DC3).** The `phase_serve` step an `intent=inference` run will exercise on the deployed inference stack. -- **Daytime workload + access (DC8).** *What* the daytime cluster deploys on top of DC2's `daytime-up` mechanic, the served-endpoint exposure, and out-of-band access distribution. +- **Served-inference CUJ (DC3).** The `phase_serve` step an `intent=inference` run — nightly *and* the daytime inference cluster — will exercise on the deployed inference stack, deploying the served `DynamoGraphDeployment` automatically instead of the current manual apply. 
+- **Both flavors per cloud during the day.** Blocked on capacity — one reservation cannot hold both a daytime cluster and the nightly batch at once. Will be picked up once more infra lands. diff --git a/infra/uat/reservations.yaml b/infra/uat/reservations.yaml index e0223035f..b2f2be389 100644 --- a/infra/uat/reservations.yaml +++ b/infra/uat/reservations.yaml @@ -41,6 +41,16 @@ # test-config-dir directory holding the per-cloud AICRConfig test # files (the ./tests/uat/<cloud>/run phase runner roots # one level up, at the cluster-config) +# daytime-intent (optional) opts this reservation into the daytime +# human-access rotation (#1281, DC8) and picks the +# flavor stood up during the working day: training | +# inference. Empty/absent = nightly batch only. This is +# the configurable cloud→flavor split — a data choice, +# not hardcoded in a workflow — defaulting to +# AWS=training, GCP=inference. Only ONE reservation per +# cloud may carry a daytime-intent today: a single +# reservation cannot hold both a long-lived daytime +# cluster and the nightly batch at once. reservations: - name: aws-h100 cloud: aws @@ -49,6 +59,7 @@ reservations: gpu-count: 8 cluster-config-path: tests/uat/aws/cluster-config.yaml test-config-dir: tests/uat/aws/tests + daytime-intent: training - name: gcp-h100 cloud: gcp reservation-id: projects/nv-dgxcloudprod-20240108/reservations/h100-a3-mega-phase-1-general-pool-384-us-c1-reservation @@ -56,3 +67,4 @@ reservations: gpu-count: 8 cluster-config-path: tests/uat/gcp/cluster-config.yaml test-config-dir: tests/uat/gcp/tests + daytime-intent: inference diff --git a/pkg/uatbroker/model.go b/pkg/uatbroker/model.go index ac034906b..b70c57ab5 100644 --- a/pkg/uatbroker/model.go +++ b/pkg/uatbroker/model.go @@ -23,6 +23,18 @@ const ( // validClouds is the set of accepted Reservation.Cloud values. var validClouds = map[string]bool{CloudAWS: true, CloudGCP: true} +// Recognized recipe-intent values. 
The daytime human-access rotation (#1281, +// DC8) picks one flavor per reservation via Reservation.DaytimeIntent; these +// mirror the intents the per-cloud UAT pipelines accept. +const ( + IntentTraining = "training" + IntentInference = "inference" +) + +// validIntents is the set of accepted intent values (Reservation.DaytimeIntent +// and, downstream, the pipeline's intent input). +var validIntents = map[string]bool{IntentTraining: true, IntentInference: true} + // Reservation is one row of the UAT reservation registry // (infra/uat/reservations.yaml). Each row maps a reservation Name — the key // the day/night broker leases via the GitHub Actions concurrency group @@ -36,6 +48,22 @@ type Reservation struct { GPUCount int `yaml:"gpu-count"` ClusterConfigPath string `yaml:"cluster-config-path"` TestConfigDir string `yaml:"test-config-dir"` + // DaytimeIntent opts this reservation into the daytime human-access + // rotation (#1281, DC8) and picks the flavor stood up on it during the + // working day: "training" or "inference". Empty means the reservation is + // NOT part of the daytime rotation (nightly batch only). This is the + // configurable cloud→flavor default — data, not code — so the split + // (AWS=training, GCP=inference at launch) can change without a workflow edit. + DaytimeIntent string `yaml:"daytime-intent"` +} + +// DaytimeAssignment is one reservation's slot in the daytime human-access +// rotation: the reservation to lease and the intent (flavor) to stand up on +// it. The daytime scheduler (uat-daytime.yaml) consumes a JSON array of these +// as its dispatch matrix. +type DaytimeAssignment struct { + Reservation string `json:"reservation"` + Intent string `json:"intent"` } // Registry is the parsed reservations.yaml document. 
diff --git a/pkg/uatbroker/registry.go b/pkg/uatbroker/registry.go index 11addf55a..41d0a0da0 100644 --- a/pkg/uatbroker/registry.go +++ b/pkg/uatbroker/registry.go @@ -68,6 +68,10 @@ func (r *Registry) Validate() error { return errors.New(errors.ErrCodeInvalidRequest, "reservation registry has no reservations") } seen := make(map[string]bool, len(r.Reservations)) + // daytimeCloud tracks which reservation already claimed each cloud's daytime + // slot. At most one reservation per cloud may opt into the daytime rotation + // (see below). + daytimeCloud := make(map[string]string, len(r.Reservations)) for i := range r.Reservations { res := &r.Reservations[i] if strings.TrimSpace(res.Name) == "" { @@ -96,10 +100,51 @@ func (r *Registry) Validate() error { return errors.New(errors.ErrCodeInvalidRequest, fmt.Sprintf("reservation %s has a non-positive gpu-count (%d)", res.Name, res.GPUCount)) } + // daytime-intent is optional (empty = not in the daytime rotation), but + // when set it must be a recognized intent — a typo would otherwise + // silently drop the reservation from the daytime rotation or dispatch a + // nonexistent per-intent config. + if res.DaytimeIntent != "" && !validIntents[res.DaytimeIntent] { + return errors.New(errors.ErrCodeInvalidRequest, + fmt.Sprintf("reservation %s has unknown daytime-intent %q (want %s or %s, or empty to opt out)", + res.Name, res.DaytimeIntent, IntentTraining, IntentInference)) + } + // At most one daytime reservation per cloud: a single reservation cannot + // host both a held daytime cluster and the nightly batch at once, so two + // daytime reservations on one cloud would contend. Enforced here (not just + // in a test on the committed file) so every caller of ParseRegistry / + // LoadRegistryFile — future tooling, alternate registries — upholds the + // invariant the lease/scheduler relies on. 
+ if res.DaytimeIntent != "" { + if prev, ok := daytimeCloud[res.Cloud]; ok { + return errors.New(errors.ErrCodeInvalidRequest, + fmt.Sprintf("cloud %s has more than one daytime-intent reservation (%s and %s); at most one is allowed", + res.Cloud, prev, res.Name)) + } + daytimeCloud[res.Cloud] = res.Name + } } return nil } +// DaytimeAssignments returns the daytime human-access rotation (#1281, DC8): +// one entry per reservation that opts in via a non-empty daytime-intent, in +// registry (document) order. Reservations with an empty daytime-intent are +// nightly-batch only and omitted. +func (r *Registry) DaytimeAssignments() []DaytimeAssignment { + out := make([]DaytimeAssignment, 0, len(r.Reservations)) + for i := range r.Reservations { + if r.Reservations[i].DaytimeIntent == "" { + continue + } + out = append(out, DaytimeAssignment{ + Reservation: r.Reservations[i].Name, + Intent: r.Reservations[i].DaytimeIntent, + }) + } + return out +} + // Lookup returns the reservation row with the given name, or an // ErrCodeNotFound error when no row matches. func (r *Registry) Lookup(name string) (*Reservation, error) { diff --git a/pkg/uatbroker/registry_test.go b/pkg/uatbroker/registry_test.go index deadf397e..07f0e3c40 100644 --- a/pkg/uatbroker/registry_test.go +++ b/pkg/uatbroker/registry_test.go @@ -143,6 +143,102 @@ reservations: wantErr: true, code: errors.ErrCodeInvalidRequest, }, + { + // Empty/absent daytime-intent is valid — the reservation is simply + // not in the daytime rotation. 
+ name: "empty daytime-intent ok", + yaml: ` +reservations: + - name: aws-h100 + cloud: aws + reservation-id: cr-x + accelerator: h100 + gpu-count: 8 + cluster-config-path: c.yaml + test-config-dir: t +`, + }, + { + name: "valid daytime-intent", + yaml: ` +reservations: + - name: aws-h100 + cloud: aws + reservation-id: cr-x + accelerator: h100 + gpu-count: 8 + cluster-config-path: c.yaml + test-config-dir: t + daytime-intent: inference +`, + }, + { + name: "unknown daytime-intent", + yaml: ` +reservations: + - name: aws-h100 + cloud: aws + reservation-id: cr-x + accelerator: h100 + gpu-count: 8 + cluster-config-path: c.yaml + test-config-dir: t + daytime-intent: serving +`, + wantErr: true, + code: errors.ErrCodeInvalidRequest, + }, + { + // Two daytime reservations on one cloud would contend for the same + // reservation (a cloud cannot hold both a held daytime cluster and the + // nightly batch at once), so Validate must reject it. + name: "two daytime reservations same cloud", + yaml: ` +reservations: + - name: aws-h100 + cloud: aws + reservation-id: cr-x + accelerator: h100 + gpu-count: 8 + cluster-config-path: c.yaml + test-config-dir: t + daytime-intent: training + - name: aws-b200 + cloud: aws + reservation-id: cr-y + accelerator: b200 + gpu-count: 8 + cluster-config-path: c2.yaml + test-config-dir: t2 + daytime-intent: inference +`, + wantErr: true, + code: errors.ErrCodeInvalidRequest, + }, + { + // Two daytime reservations across DIFFERENT clouds is fine — that is + // exactly the launch topology (AWS training + GCP inference). 
+ name: "daytime reservations across clouds ok", + yaml: ` +reservations: + - name: aws-h100 + cloud: aws + reservation-id: cr-x + accelerator: h100 + gpu-count: 8 + cluster-config-path: c.yaml + test-config-dir: t + daytime-intent: training + - name: gcp-h100 + cloud: gcp + reservation-id: projects/p/reservations/r + accelerator: h100 + gpu-count: 8 + cluster-config-path: c2.yaml + test-config-dir: t2 + daytime-intent: inference +`, + }, } for _, tt := range tests { @@ -229,6 +325,65 @@ func TestLoadRegistryFile(t *testing.T) { } } +func TestDaytimeAssignments(t *testing.T) { + // Only rows with a non-empty daytime-intent appear, in document order. + const yaml = ` +reservations: + - name: aws-h100 + cloud: aws + reservation-id: cr-x + accelerator: h100 + gpu-count: 8 + cluster-config-path: c.yaml + test-config-dir: t + daytime-intent: training + - name: gcp-h100 + cloud: gcp + reservation-id: projects/p/reservations/r + accelerator: h100 + gpu-count: 8 + cluster-config-path: c2.yaml + test-config-dir: t2 + daytime-intent: inference + - name: aws-b200 + cloud: aws + reservation-id: cr-y + accelerator: b200 + gpu-count: 8 + cluster-config-path: c3.yaml + test-config-dir: t3 +` + reg, err := ParseRegistry([]byte(yaml)) + if err != nil { + t.Fatalf("ParseRegistry: %v", err) + } + got := reg.DaytimeAssignments() + want := []DaytimeAssignment{ + {Reservation: "aws-h100", Intent: IntentTraining}, + {Reservation: "gcp-h100", Intent: IntentInference}, + } + if len(got) != len(want) { + t.Fatalf("DaytimeAssignments() = %+v, want %+v", got, want) + } + for i := range want { + if got[i] != want[i] { + t.Errorf("DaytimeAssignments()[%d] = %+v, want %+v", i, got[i], want[i]) + } + } +} + +func TestDaytimeAssignmentsNone(t *testing.T) { + // A registry with no daytime-intent rows yields an empty (non-nil) slice — + // the scheduler's matrix is then empty and it dispatches nothing. 
+ reg, err := ParseRegistry([]byte(validRegistryYAML)) + if err != nil { + t.Fatalf("ParseRegistry: %v", err) + } + if got := reg.DaytimeAssignments(); len(got) != 0 { + t.Errorf("DaytimeAssignments() = %+v, want empty", got) + } +} + // TestCommittedRegistryValid guards the actual checked-in registry: it must // parse, validate, and carry the two launch reservations. A bad data edit // fails here before it can break the broker workflows. @@ -248,4 +403,33 @@ func TestCommittedRegistryValid(t *testing.T) { t.Errorf("%q cloud = %q, want %q", name, res.Cloud, cloud) } } + + // The launch cloud→flavor split (#1281, DC8): AWS hosts training, GCP hosts + // inference. A future re-split changes these values here deliberately. + assignments := reg.DaytimeAssignments() + gotIntent := make(map[string]string, len(assignments)) + for _, a := range assignments { + gotIntent[a.Reservation] = a.Intent + } + wantIntent := map[string]string{"aws-h100": IntentTraining, "gcp-h100": IntentInference} + for name, intent := range wantIntent { + if gotIntent[name] != intent { + t.Errorf("committed registry daytime-intent[%q] = %q, want %q", name, gotIntent[name], intent) + } + } + // At most one daytime reservation per cloud: one reservation cannot host + // both a held daytime cluster and the nightly batch at once. 
+ perCloud := make(map[string]int) + for _, a := range assignments { + res, lookupErr := reg.Lookup(a.Reservation) + if lookupErr != nil { + t.Fatalf("Lookup(%q): %v", a.Reservation, lookupErr) + } + perCloud[res.Cloud]++ + } + for cloud, n := range perCloud { + if n > 1 { + t.Errorf("cloud %q has %d daytime reservations, want at most 1 (both flavors per cloud is out of scope at launch)", cloud, n) + } + } } diff --git a/tools/uat-broker/README.md b/tools/uat-broker/README.md index 83ed2a0c5..033c7e487 100644 --- a/tools/uat-broker/README.md +++ b/tools/uat-broker/README.md @@ -27,6 +27,7 @@ uat-broker reservations --name aws-h100 >> "$GITHUB_OUTPUT" # gpu-count=8 # cluster-config-path=tests/uat/aws/cluster-config.yaml # test-config-dir=tests/uat/aws/tests +# daytime-intent=training ``` List every reservation name (one per line): @@ -35,6 +36,20 @@ List every reservation name (one per line): uat-broker reservations --list ``` +Print the daytime human-access rotation (#1281, DC8) as JSON — one +`{reservation, intent}` entry per row with a non-empty `daytime-intent`, +in document order — for the daytime scheduler's dispatch matrix: + +```sh +uat-broker reservations --daytime | jq -c . +# [{"reservation":"aws-h100","intent":"training"},{"reservation":"gcp-h100","intent":"inference"}] +``` + +The output is pretty-printed; the daytime scheduler compacts it with `jq -c` +into a one-line `strategy.matrix.include` array. + +`--name`, `--list`, and `--daytime` are mutually exclusive. + ### `schedule` Expand the ordered nightly version matrix as JSON — the tip-of-main cell diff --git a/tools/uat-broker/main.go b/tools/uat-broker/main.go index 6dc51d916..1fcefbbf0 100644 --- a/tools/uat-broker/main.go +++ b/tools/uat-broker/main.go @@ -27,6 +27,11 @@ // uat-broker reservations --list [--file ] // Print every reservation name, one per line. 
// +// uat-broker reservations --daytime [--file ] +// Print the daytime human-access rotation (#1281, DC8) as a JSON array +// of {reservation, intent} — one entry per row with a non-empty +// daytime-intent — for the daytime scheduler's dispatch matrix. +// // uat-broker schedule [--file ] [--reservations a,b] \ // [--previous-n N] [--include-main] < tags // Read candidate tags from stdin and print the ordered nightly @@ -63,6 +68,7 @@ const usage = `uat-broker — UAT reservation registry + nightly schedule helper Usage: uat-broker reservations --name [--file ] resolve one row as key=value lines uat-broker reservations --list [--file ] list reservation names + uat-broker reservations --daytime [--file ] print the daytime rotation as JSON [{reservation,intent}] uat-broker schedule [--file ] [--reservations a,b] [--previous-n N] [--include-main] expand the nightly version matrix (tags on stdin) as JSON` @@ -118,13 +124,15 @@ func runReservations(args []string, stdout, stderr io.Writer) error { fs := flag.NewFlagSet("reservations", flag.ContinueOnError) fs.SetOutput(stderr) var ( - file string - name string - list bool + file string + name string + list bool + daytime bool ) fs.StringVar(&file, "file", defaultRegistryPath, "path to the reservation registry") fs.StringVar(&name, "name", "", "reservation name to resolve") fs.BoolVar(&list, "list", false, "list all reservation names, one per line") + fs.BoolVar(&daytime, "daytime", false, "print the daytime human-access rotation as JSON: [{\"reservation\",\"intent\"}]") if err := fs.Parse(args); err != nil { return flagParseErr(err, "reservations") } @@ -132,26 +140,42 @@ func runReservations(args []string, stdout, stderr io.Writer) error { return errors.New(errors.ErrCodeInvalidRequest, "reservations: unexpected positional arguments: "+strings.Join(fs.Args(), " ")) } + // The three output modes are mutually exclusive: --name resolves one row, + // --list prints names, --daytime prints the daytime rotation. 
+ if selected := boolCount(list, daytime, name != ""); selected > 1 { + return errors.New(errors.ErrCodeInvalidRequest, + "reservations: --name, --list, and --daytime are mutually exclusive") + } reg, err := uatbroker.LoadRegistryFile(file) if err != nil { return err } + // The daytime rotation is JSON (a dispatch matrix), written directly by the + // encoder; the other two modes are line-oriented. Handle JSON first so its + // output path is unambiguous. + if daytime { + enc := json.NewEncoder(stdout) + enc.SetIndent("", " ") + if err := enc.Encode(reg.DaytimeAssignments()); err != nil { + return errors.Wrap(errors.ErrCodeInternal, "encode daytime assignments", err) + } + return nil + } + // Build the output first, then write it in one checked call so a broken // pipe or an unwritable $GITHUB_OUTPUT surfaces as a failure instead of a // silent exit-0 that leaves downstream jobs without their inputs. Writes to // the strings.Builder cannot fail. var b strings.Builder switch { - case list && name != "": - return errors.New(errors.ErrCodeInvalidRequest, "reservations: --list and --name are mutually exclusive") case list: for _, n := range reg.Names() { fmt.Fprintln(&b, n) } case name == "": - return errors.New(errors.ErrCodeInvalidRequest, "reservations: --name is required (or pass --list)") + return errors.New(errors.ErrCodeInvalidRequest, "reservations: --name is required (or pass --list or --daytime)") default: res, lookupErr := reg.Lookup(name) if lookupErr != nil { @@ -164,6 +188,9 @@ func runReservations(args []string, stdout, stderr io.Writer) error { fmt.Fprintf(&b, "gpu-count=%d\n", res.GPUCount) fmt.Fprintf(&b, "cluster-config-path=%s\n", res.ClusterConfigPath) fmt.Fprintf(&b, "test-config-dir=%s\n", res.TestConfigDir) + // Empty when the reservation is not in the daytime rotation; callers + // that don't consume this key simply ignore the line. 
// boolCount tallies the true values among flags. Callers use the tally to
// reject an invocation that selects more than one mutually exclusive mode.
func boolCount(flags ...bool) int {
	var set int
	for i := range flags {
		if !flags[i] {
			continue
		}
		set++
	}
	return set
}
want %+v", i, got[i], want[i]) + } + } +} + func TestReservationsExitCodes(t *testing.T) { reg := writeRegistry(t) tests := []struct { @@ -105,6 +131,8 @@ func TestReservationsExitCodes(t *testing.T) { {"unknown registry file", []string{"reservations", "--file", "/no/such.yaml", "--name", "aws-h100"}, errors.ExitInvalidInput}, {"bad flag", []string{"reservations", "--bogus"}, errors.ExitInvalidInput}, {"list and name conflict", []string{"reservations", "--file", reg, "--list", "--name", "aws-h100"}, errors.ExitInvalidInput}, + {"daytime and name conflict", []string{"reservations", "--file", reg, "--daytime", "--name", "aws-h100"}, errors.ExitInvalidInput}, + {"daytime and list conflict", []string{"reservations", "--file", reg, "--daytime", "--list"}, errors.ExitInvalidInput}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) {