From a0c2adfb43938ef9bc871d33f5c6b97a881e8a9c Mon Sep 17 00:00:00 2001 From: Shayan Namaghi Date: Tue, 26 May 2026 11:45:42 -0700 Subject: [PATCH] feat(helm): add Flow (RLA) helm chart and prereqs wiring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the Flow rack lifecycle orchestrator (formerly RLA) helm chart at helm/charts/nico-flow/, alongside the rest of the NICo subcharts. Flow runs as a single pod with three gRPC containers — flow (50051), psm (50052), nsm (50053) — sharing a SPIFFE cert and communicating over headless Services that DNS-resolve to the pod IP. This mirrors the upstream forged deployment. Flow ships as a STANDALONE Helm release (release name "flow", namespace "flow"), NOT as part of `helm install nico ./helm`. The nico umbrella declares it as a conditional dependency with nico-flow.enabled defaulted to false in helm/values.yaml — this keeps the chart in its conventional helm/charts/ location while preventing Helm v3+ from auto-rendering it into the nico release (where it would conflict with nico-prereqs over the nico-system namespace). Wires per-component dependencies into helm-prereqs so ./setup.sh brings flow up end-to-end: - postgresql.yaml: provisions flow/psm/nsm databases and roles on nico-pg-cluster - eso-external-secrets.yaml: ClusterExternalSecrets sync the per-service DB credentials into the flow namespace - flow-vault-tokens-job.yaml (new): post-install hook mints scoped Vault tokens for psm/nsm and writes them as Secrets in the flow ns - values.yaml: new flow.enabled / flow.namespace toggles; flips vault.nicoApiK8sAuth.enabled=true (carbide-api requires the role) setup.sh phase 7i deploys the chart with a pre-apply Certificate dance to avoid cert-manager / FailedMount races, waits for vault tokens and ESO DB-cred syncs, then helm upgrade --installs flow. clean.sh and health-check.sh updated to cover the new namespace and resources. 
The nico-flow Namespace template carries helm.sh/resource-policy: keep so uninstalling flow does not wipe the prereqs-managed secrets that live in the namespace. --- helm-prereqs/clean.sh | 22 +- helm-prereqs/health-check.sh | 18 ++ helm-prereqs/setup.sh | 159 ++++++++++++- .../templates/eso-external-secrets.yaml | 34 +++ .../templates/flow-vault-tokens-job.yaml | 201 ++++++++++++++++ helm-prereqs/templates/postgresql.yaml | 16 ++ helm-prereqs/values.yaml | 27 ++- helm/Chart.yaml | 3 + helm/charts/nico-flow/Chart.yaml | 13 ++ helm/charts/nico-flow/templates/_helpers.tpl | 90 ++++++++ .../nico-flow/templates/certificate.yaml | 44 ++++ .../charts/nico-flow/templates/configmap.yaml | 12 + .../nico-flow/templates/deployment.yaml | 216 ++++++++++++++++++ .../templates/image-pull-secret.yaml | 28 +++ .../charts/nico-flow/templates/namespace.yaml | 13 ++ helm/charts/nico-flow/templates/rbac.yaml | 27 +++ .../nico-flow/templates/service-account.yaml | 8 + .../nico-flow/templates/service-grpc.yaml | 55 +++++ helm/charts/nico-flow/values.yaml | 203 ++++++++++++++++ helm/values.yaml | 12 + 20 files changed, 1189 insertions(+), 12 deletions(-) create mode 100644 helm-prereqs/templates/flow-vault-tokens-job.yaml create mode 100644 helm/charts/nico-flow/Chart.yaml create mode 100644 helm/charts/nico-flow/templates/_helpers.tpl create mode 100644 helm/charts/nico-flow/templates/certificate.yaml create mode 100644 helm/charts/nico-flow/templates/configmap.yaml create mode 100644 helm/charts/nico-flow/templates/deployment.yaml create mode 100644 helm/charts/nico-flow/templates/image-pull-secret.yaml create mode 100644 helm/charts/nico-flow/templates/namespace.yaml create mode 100644 helm/charts/nico-flow/templates/rbac.yaml create mode 100644 helm/charts/nico-flow/templates/service-account.yaml create mode 100644 helm/charts/nico-flow/templates/service-grpc.yaml create mode 100644 helm/charts/nico-flow/values.yaml diff --git a/helm-prereqs/clean.sh b/helm-prereqs/clean.sh index 
ef79427db5..cf00bf33a7 100755 --- a/helm-prereqs/clean.sh +++ b/helm-prereqs/clean.sh @@ -38,8 +38,12 @@ cd "${SCRIPT_DIR}" # cert-manager and ClusterIssuers. # --------------------------------------------------------------------------- echo "=== [0/8] Uninstalling NICo REST stack ===" -helm uninstall nico-rest-site-agent -n nico-rest 2>/dev/null || true -helm uninstall nico-rest -n nico-rest 2>/dev/null || true +# Flow goes first — it talks to Temporal + nico-api and depends on credentials +# from both nico-prereqs (DB creds, vault tokens) and the REST stack. +helm uninstall flow -n flow 2>/dev/null || true +kubectl delete ns flow --wait=false --ignore-not-found 2>/dev/null || true +helm uninstall nico-rest-site-agent -n nico-rest 2>/dev/null || true +helm uninstall nico-rest -n nico-rest 2>/dev/null || true helm uninstall temporal -n temporal 2>/dev/null || true if kubectl get deploy keycloak -n nico-rest &>/dev/null; then @@ -50,10 +54,10 @@ else fi kubectl delete clusterissuer nico-rest-ca-issuer --ignore-not-found 2>/dev/null || true -kubectl delete ns nico-rest temporal \ +kubectl delete ns nico-rest temporal flow \ --wait=false --ignore-not-found 2>/dev/null || true -echo "Waiting for nico-rest and temporal namespaces to terminate..." -kubectl wait --for=delete ns/nico-rest ns/temporal \ +echo "Waiting for nico-rest, temporal, and flow namespaces to terminate..." 
+kubectl wait --for=delete ns/nico-rest ns/temporal ns/flow \ --timeout=120s 2>/dev/null || true # --------------------------------------------------------------------------- @@ -153,13 +157,15 @@ kubectl delete clusterissuer \ kubectl delete clustersecretstore \ cert-manager-ns-secretstore postgres-ns-secretstore \ --ignore-not-found 2>/dev/null || true -kubectl delete clusterexternalsecret nico-roots-eso nico-db-eso \ +kubectl delete clusterexternalsecret \ + nico-roots-eso nico-db-eso \ + flow-db-eso psm-db-eso nsm-db-eso \ --ignore-not-found 2>/dev/null || true kubectl delete clusterrole \ - vault-pki-config-reader eso-postgres-ns-role \ + vault-pki-config-reader eso-postgres-ns-role flow-vault-tokens-writer \ --ignore-not-found 2>/dev/null || true kubectl delete clusterrolebinding \ - vault-pki-config-reader eso-postgres-ns-rolebinding \ + vault-pki-config-reader eso-postgres-ns-rolebinding flow-vault-tokens-writer \ --ignore-not-found 2>/dev/null || true # --------------------------------------------------------------------------- diff --git a/helm-prereqs/health-check.sh b/helm-prereqs/health-check.sh index c0b5f50203..99e14b13a2 100755 --- a/helm-prereqs/health-check.sh +++ b/helm-prereqs/health-check.sh @@ -334,8 +334,26 @@ for _OPT_DEP in nico-hardware-health nico-ssh-console-rs nico-dsx-exchange-consu fi done +section "NICo Flow" +FLOW_NS="${FLOW_NS:-flow}" +if kc get ns "${FLOW_NS}" &>/dev/null; then + _check_deployment "${FLOW_NS}" flow + for _S in psm-vault-token nsm-vault-token \ + flow.nico.nico-pg-cluster.credentials \ + psm.nico.nico-pg-cluster.credentials \ + nsm.nico.nico-pg-cluster.credentials \ + flow-certificate temporal-client-certs nico-roots; do + _check_secret_exists "${FLOW_NS}" "${_S}" + done +else + skip "flow namespace not present — flow disabled or not yet deployed" +fi + section "NICo Jobs" _check_job_complete "${NICO_NS}" vault-pki-config +if kc get job -n "${NICO_NS}" flow-vault-tokens &>/dev/null; then + _check_job_complete 
"${NICO_NS}" flow-vault-tokens +fi # Migration job: find by label (name includes a random suffix) _MIG_JOB=$(kc get jobs -n "${NICO_NS}" -l 'app.kubernetes.io/name=nico-api-migrate' \ diff --git a/helm-prereqs/setup.sh b/helm-prereqs/setup.sh index 513eb149f7..d3ae8cc351 100755 --- a/helm-prereqs/setup.sh +++ b/helm-prereqs/setup.sh @@ -58,6 +58,9 @@ # ./setup.sh -y # skip all prompts, deploy everything automatically # ./setup.sh --skip-core # skip Phase 6 NICo Core (print command, deploy manually) # ./setup.sh --skip-rest # skip Phase 7 NICo REST entirely (no repo needed) +# ./setup.sh --skip-flow # skip Phase 7i NICo Flow (REST still installs) +# # pair with helm-prereqs/values.yaml::flow.enabled=false +# # to skip Flow prereqs (DBs / ESO / vault tokens) too # ./setup.sh --skip-core --skip-rest # fully non-interactive infra-only run # ./setup.sh --core-values /path/to/values.yaml # use site-specific values for Phase 6 # ./setup.sh --metallb-config /path/to/metallb.yaml # use site-specific MetalLB config (file or kustomize dir) @@ -78,6 +81,7 @@ cd "${SCRIPT_DIR}" AUTO_YES=false SKIP_CORE=false SKIP_REST=false +SKIP_FLOW=false CORE_VALUES="" METALLB_CONFIG="" SITE_OVERLAY="" @@ -86,6 +90,7 @@ while [[ $# -gt 0 ]]; do -y) AUTO_YES=true ;; --skip-core) SKIP_CORE=true ;; --skip-rest) SKIP_REST=true ;; + --skip-flow) SKIP_FLOW=true ;; --debug) set -x ;; --core-values) [[ -z "${2:-}" ]] && { echo "Error: --core-values requires a file path"; exit 1; } @@ -102,7 +107,7 @@ while [[ $# -gt 0 ]]; do SITE_OVERLAY="$(cd "$(dirname "$2")" && pwd)/$(basename "$2")" [[ ! 
-d "${SITE_OVERLAY}" ]] && { echo "Error: --site-overlay directory not found: $2"; exit 1; } shift ;; - *) echo "Usage: $0 [-y] [--skip-core] [--skip-rest] [--core-values ] [--metallb-config ] [--site-overlay ] [--debug]"; exit 1 ;; + *) echo "Usage: $0 [-y] [--skip-core] [--skip-rest] [--skip-flow] [--core-values ] [--metallb-config ] [--site-overlay ] [--debug]"; exit 1 ;; esac shift done @@ -111,7 +116,7 @@ done # Pre-flight checks — env vars, tools, config files, NICo REST repo # Exports NICO_REST_REPO if resolved. Exits 1 if user declines to continue. # --------------------------------------------------------------------------- -export AUTO_YES SKIP_CORE SKIP_REST +export AUTO_YES SKIP_CORE SKIP_REST SKIP_FLOW # shellcheck source=preflight.sh source "${SCRIPT_DIR}/preflight.sh" @@ -591,6 +596,9 @@ kubectl exec -n temporal deploy/temporal-admintools -- \ sh -c "temporal operator namespace create -n cloud --address ${_TEMPORAL_ADDR} ${_TEMPORAL_TLS}" 2>/dev/null || true kubectl exec -n temporal deploy/temporal-admintools -- \ sh -c "temporal operator namespace create -n site --address ${_TEMPORAL_ADDR} ${_TEMPORAL_TLS}" 2>/dev/null || true +# flow Temporal namespace — required by NICo Flow workers; pod panics on startup if absent. +kubectl exec -n temporal deploy/temporal-admintools -- \ + sh -c "temporal operator namespace create -n flow --address ${_TEMPORAL_ADDR} ${_TEMPORAL_TLS}" 2>/dev/null || true echo "Temporal namespaces ready" _SETUP_PHASE="[7g/7] NICo REST helm chart" @@ -721,13 +729,25 @@ kubectl exec -n temporal deploy/temporal-admintools -- \ sh -c "temporal operator namespace create -n '${NICO_SITE_UUID}' --address ${_TEMPORAL_ADDR} ${_TEMPORAL_TLS}" 2>/dev/null || true echo "Temporal namespace ready" +# FLOW_GRPC_ENABLED toggles the site-agent's Flow gRPC client (see +# carbide-rest/site-agent/pkg/components/config/config_manager.go — +# strings.ToLower(env)=="true"). 
Without it, site-agent never opens a +# connection to the Flow pod deployed in phase 7i. We default it ON when +# Flow itself is being deployed; users can flip it back via --set when +# pairing --skip-flow. +_FLOW_GRPC_ENABLED="true" +if "${SKIP_FLOW}"; then + _FLOW_GRPC_ENABLED="false" +fi + helm upgrade --install nico-rest-site-agent "${NICO_SITE_AGENT_CHART}" \ "${NICO_SITE_AGENT_ARGS[@]}" \ --set "envConfig.CLUSTER_ID=${NICO_SITE_UUID}" \ --set "envConfig.TEMPORAL_SUBSCRIBE_NAMESPACE=${NICO_SITE_UUID}" \ --set "envConfig.TEMPORAL_SUBSCRIBE_QUEUE=site" \ + --set "envConfig.FLOW_GRPC_ENABLED=${_FLOW_GRPC_ENABLED}" \ --timeout 300s --wait -echo "NICo REST site-agent deployed and bootstrap complete" +echo "NICo REST site-agent deployed and bootstrap complete (FLOW_GRPC_ENABLED=${_FLOW_GRPC_ENABLED})" # Verify the site-agent's gRPC connection to NICo Core succeeded. The site-agent attempts # the connection exactly once at startup with a 5-second deadline; if it @@ -758,6 +778,139 @@ if [ "${_CONNECTED}" = "false" ]; then echo "Site-agent pod restarted — gRPC connection will be retried" fi +# --- 7i. NICo Flow ------------------------------------------------------------ +# Flow is the rack lifecycle orchestrator (formerly RLA). Single pod with three +# containers — flow (50051), psm (50052), nsm (50053). Runs in its own `flow` +# namespace. 
+# +# Prerequisites already in place by this point: +# - flow/psm/nsm databases on nico-pg-cluster (helm-prereqs postgresql.yaml) +# - flow.nico/psm.nico/nsm.nico DB credentials synced via ESO into the flow +# namespace by the flow-db-eso / psm-db-eso / nsm-db-eso ClusterExternalSecrets +# - psm-vault-token and nsm-vault-token Secrets in the flow namespace +# (provisioned by the flow-vault-tokens post-install hook) +# - Temporal `flow` namespace (created in phase 7f above) +# - nico-rest-ca-issuer ClusterIssuer (installed by phase 7b — issues the +# temporal-client-certs) +# - vault-nico-issuer ClusterIssuer (issues the SPIFFE cert) +# +# Same pre-apply-cert dance as the site-agent: render the Certificate(s) ahead +# of the helm install so cert-manager has time to issue them and the pod doesn't +# hit a FailedMount race on the spiffe / temporal-client-certs secrets. +if "${SKIP_FLOW}"; then + echo "=== [7i/7] NICo Flow — skipped (--skip-flow) ===" + _SETUP_PHASE="complete" + exit 0 +fi +_SETUP_PHASE="[7i/7] NICo Flow" +echo "=== [7i/7] NICo Flow ===" + +NICO_FLOW_CHART="${SCRIPT_DIR}/../helm/charts/nico-flow" +NICO_FLOW_NAMESPACE="flow" + +NICO_FLOW_ARGS=( + --namespace "${NICO_FLOW_NAMESPACE}" + --create-namespace + --set "global.image.repository=${NICO_IMAGE_REGISTRY}" + ## Flow (nico-flow / nico-psm / nico-nsm) ships on the same image release + ## line as NICo REST — they're built and tagged together — so reuse + ## NICO_REST_IMAGE_TAG, not NICO_CORE_IMAGE_TAG (which is carbide-api). + --set "global.image.tag=${NICO_REST_IMAGE_TAG}" +) + +# Render the dockerconfigjson for the chart-managed image-pull-secret. Same +# pattern as the NICo REST common chart — keep the registry credential on +# the helm command line so the chart template can install it as a +# pre-install hook (pod can't pull from nvcr.io otherwise). 
+if [[ -n "${REGISTRY_PULL_SECRET:-}" ]]; then + _flow_registry_server="${NICO_IMAGE_REGISTRY%%/*}" + _flow_docker_cfg="$(printf '{"auths":{"%s":{"username":"%s","password":"%s"}}}' \ + "${_flow_registry_server}" \ + "${REGISTRY_PULL_USERNAME:-\$oauthtoken}" \ + "${REGISTRY_PULL_SECRET}" | base64 | tr -d '\n')" + NICO_FLOW_ARGS+=( + --set "global.imagePullSecrets[0].name=image-pull-secret" + --set "imagePullSecret.dockerconfigjson=${_flow_docker_cfg}" + ) +fi + +# Pre-apply Certificates so cert-manager can issue secrets before the pod schedules. +echo "Pre-applying flow Certificates (SPIFFE + Temporal client)..." +helm template flow "${NICO_FLOW_CHART}" \ + "${NICO_FLOW_ARGS[@]}" \ + --show-only templates/namespace.yaml | kubectl apply -f - +helm template flow "${NICO_FLOW_CHART}" \ + "${NICO_FLOW_ARGS[@]}" \ + --show-only templates/certificate.yaml | kubectl apply -f - +kubectl annotate certificate/flow-certificate -n "${NICO_FLOW_NAMESPACE}" \ + "meta.helm.sh/release-name=flow" \ + "meta.helm.sh/release-namespace=${NICO_FLOW_NAMESPACE}" --overwrite +kubectl annotate certificate/temporal-client-certs -n "${NICO_FLOW_NAMESPACE}" \ + "meta.helm.sh/release-name=flow" \ + "meta.helm.sh/release-namespace=${NICO_FLOW_NAMESPACE}" --overwrite +kubectl label certificate/flow-certificate -n "${NICO_FLOW_NAMESPACE}" \ + "app.kubernetes.io/managed-by=Helm" --overwrite +kubectl label certificate/temporal-client-certs -n "${NICO_FLOW_NAMESPACE}" \ + "app.kubernetes.io/managed-by=Helm" --overwrite + +# Annotate/label the namespace itself — the flow-vault-tokens-job (nico-prereqs +# helm hook) creates this namespace ahead of the flow release. Without Helm +# ownership metadata, helm install refuses to adopt it. 
+kubectl annotate namespace "${NICO_FLOW_NAMESPACE}" \ + "meta.helm.sh/release-name=flow" \ + "meta.helm.sh/release-namespace=${NICO_FLOW_NAMESPACE}" --overwrite +kubectl label namespace "${NICO_FLOW_NAMESPACE}" \ + "app.kubernetes.io/managed-by=Helm" --overwrite + +echo "Waiting for cert-manager to issue flow-certificate..." +kubectl wait --for=condition=Ready certificate/flow-certificate \ + -n "${NICO_FLOW_NAMESPACE}" --timeout=120s +echo "Waiting for cert-manager to issue temporal-client-certs..." +kubectl wait --for=condition=Ready certificate/temporal-client-certs \ + -n "${NICO_FLOW_NAMESPACE}" --timeout=120s + +# Wait for the psm/nsm vault tokens and DB credential ESO syncs to land +# (provisioned by helm-prereqs hooks; may still be in flight if nico-prereqs +# was re-installed just before this phase). Fail-fast if any secret never +# shows up — the alternative (silently falling through to helm install) is +# 5 minutes of FailedMount-loop before helm gives up with an opaque message. +_wait_for_secret() { + local _name="$1" + local _ns="$2" + local _hint="$3" + for _i in $(seq 1 24); do + if kubectl get secret "${_name}" -n "${_ns}" >/dev/null 2>&1; then + echo " ${_name} ready" + return 0 + fi + echo " Waiting for ${_name} (${_i}/24)..." + sleep 5 + done + echo "ERROR: Secret ${_name} did not appear in namespace ${_ns} within 120s." + echo " ${_hint}" + return 1 +} + +echo "Waiting for psm/nsm Vault tokens..." +for _s in psm-vault-token nsm-vault-token; do + _wait_for_secret "${_s}" "${NICO_FLOW_NAMESPACE}" \ + "Provisioned by the flow-vault-tokens helm hook in nico-prereqs. Check 'kubectl logs -n nico-system job/flow-vault-tokens' and confirm helm-prereqs/values.yaml::flow.enabled=true." +done + +echo "Waiting for flow/psm/nsm DB credentials..." 
+for _s in flow.nico.nico-pg-cluster.credentials \ + psm.nico.nico-pg-cluster.credentials \ + nsm.nico.nico-pg-cluster.credentials; do + _wait_for_secret "${_s}" "${NICO_FLOW_NAMESPACE}" \ + "Synced by the flow-db-eso/psm-db-eso/nsm-db-eso ClusterExternalSecrets in nico-prereqs. Check 'kubectl describe clusterexternalsecret -A | grep flow' and confirm helm-prereqs/values.yaml::flow.enabled=true." +done + +echo "Installing flow helm chart..." +helm upgrade --install flow "${NICO_FLOW_CHART}" \ + "${NICO_FLOW_ARGS[@]}" \ + --timeout 300s --wait +echo "NICo Flow deployed" + echo "" echo "=========================================================================" echo " Setup complete" diff --git a/helm-prereqs/templates/eso-external-secrets.yaml b/helm-prereqs/templates/eso-external-secrets.yaml index afffa7c821..51c1ec7825 100644 --- a/helm-prereqs/templates/eso-external-secrets.yaml +++ b/helm-prereqs/templates/eso-external-secrets.yaml @@ -66,4 +66,38 @@ spec: key: nico-system.nico.nico-pg-cluster.credentials.postgresql.acid.zalan.do version: v1 {{- end }} + +{{/* + flow/psm/nsm DB credential syncs. + The Zalando operator generates one Secret per (user, cluster) pair in the + postgres namespace; ESO extracts each and projects it into the flow + namespace under the short name the flow chart consumes. 
+*/}} +{{- if and .Values.postgresql.enabled .Values.flow.enabled }} +{{- range $svc := list "flow" "psm" "nsm" }} +--- +apiVersion: external-secrets.io/v1beta1 +kind: ClusterExternalSecret +metadata: + name: {{ printf "%s-db-eso" $svc }} +spec: + namespaceSelector: + matchExpressions: + - key: kubernetes.io/metadata.name + operator: In + values: ["{{ $.Values.flow.namespace }}"] + refreshTime: 30s + externalSecretSpec: + secretStoreRef: + name: postgres-ns-secretstore + kind: ClusterSecretStore + target: + name: {{ printf "%s.nico.nico-pg-cluster.credentials" $svc }} + deletionPolicy: Retain + dataFrom: + - extract: + key: {{ printf "%s.nico.nico-pg-cluster.credentials.postgresql.acid.zalan.do" $svc }} + version: v1 +{{- end }} +{{- end }} {{- end }} diff --git a/helm-prereqs/templates/flow-vault-tokens-job.yaml b/helm-prereqs/templates/flow-vault-tokens-job.yaml new file mode 100644 index 0000000000..020be16f2f --- /dev/null +++ b/helm-prereqs/templates/flow-vault-tokens-job.yaml @@ -0,0 +1,201 @@ +{{- if and .Values.flow.enabled .Values.vault.configJob.enabled }} +## ============================================================================= +## Flow Vault Tokens — psm-vault-token, nsm-vault-token +## ============================================================================= +## PSM and NSM containers in the flow pod authenticate to Vault with a +## periodic token (VAULT_TOKEN env var). Forged provisions these out-of-band; +## here we mint them with a scoped policy as a post-install Helm hook so +## `./setup.sh` brings flow up end-to-end with no manual secret juggling. +## +## Runs after the main vault-pki-config job (hook-weight: 20 > 10). +## Mints a periodic Vault token per service (period=24h; the token never +## expires as long as it's renewed within that window) with read/write on +## its own KV subtree, then writes the token into a Secret in the flow +## namespace. 
+## ============================================================================= +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: flow-vault-tokens-sa + namespace: {{ .Values.namespace }} + annotations: + helm.sh/hook: post-install,post-upgrade + helm.sh/hook-weight: "15" + helm.sh/hook-delete-policy: before-hook-creation +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: flow-vault-tokens-writer + annotations: + helm.sh/hook: post-install,post-upgrade + helm.sh/hook-weight: "15" + helm.sh/hook-delete-policy: before-hook-creation +rules: + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "create", "patch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "create", "update", "patch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: flow-vault-tokens-writer + annotations: + helm.sh/hook: post-install,post-upgrade + helm.sh/hook-weight: "15" + helm.sh/hook-delete-policy: before-hook-creation +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: flow-vault-tokens-writer +subjects: + - kind: ServiceAccount + name: flow-vault-tokens-sa + namespace: {{ .Values.namespace }} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: flow-vault-tokens + namespace: {{ .Values.namespace }} + annotations: + helm.sh/hook: post-install,post-upgrade + helm.sh/hook-weight: "20" + helm.sh/hook-delete-policy: before-hook-creation +spec: + backoffLimit: 2 + activeDeadlineSeconds: 600 + template: + spec: + serviceAccountName: flow-vault-tokens-sa + restartPolicy: Never + initContainers: + - name: wait-vault + image: {{ .Values.vault.configJob.vaultImage | quote }} + env: + - name: VAULT_ADDR + value: {{ .Values.vault.address | quote }} + - name: VAULT_SKIP_VERIFY + value: "true" + command: + - sh + - -c + - | + echo "Waiting for Vault PKI config job to finish (policies + auth methods need to exist)..." 
+ until vault status -tls-skip-verify -format=json 2>/dev/null \ + | grep -q '"sealed":[[:space:]]*false'; do + echo " Vault not ready, retrying in 5s..." + sleep 5 + done + echo "Vault ready" + volumes: + - name: tokens + emptyDir: {} + containers: + ## --------------------------------------------------------------- + ## Mint per-service Vault tokens, write them to /tokens. + ## --------------------------------------------------------------- + - name: mint + image: {{ .Values.vault.configJob.vaultImage | quote }} + env: + - name: VAULT_ADDR + value: {{ .Values.vault.address | quote }} + - name: VAULT_SKIP_VERIFY + value: "true" + - name: VAULT_TOKEN + valueFrom: + secretKeyRef: + name: nico-vault-token + key: token + volumeMounts: + - name: tokens + mountPath: /tokens + command: + - /bin/sh + - -c + - | + set -e + + # PSM policy: read/write on its own KV subtree + PKI sign for SPIFFE. + echo "Writing psm-vault-policy..." + vault policy write psm-vault-policy - <<'POLICY' + path "{{ .Values.vault.kvMount }}/data/psm/*" { + capabilities = ["create", "read", "update", "delete", "list"] + } + path "{{ .Values.vault.kvMount }}/metadata/psm/*" { + capabilities = ["read", "list", "delete"] + } + path "{{ .Values.vault.pkiMount }}/sign/{{ .Values.vault.pkiRole }}" { + capabilities = ["create", "update"] + } + POLICY + + echo "Writing nsm-vault-policy..." + vault policy write nsm-vault-policy - <<'POLICY' + path "{{ .Values.vault.kvMount }}/data/nsm/*" { + capabilities = ["create", "read", "update", "delete", "list"] + } + path "{{ .Values.vault.kvMount }}/metadata/nsm/*" { + capabilities = ["read", "list", "delete"] + } + path "{{ .Values.vault.pkiMount }}/sign/{{ .Values.vault.pkiRole }}" { + capabilities = ["create", "update"] + } + POLICY + + # Mint periodic tokens (renewable every 24h, never expire while renewed). + echo "Minting psm token..." 
+ vault token create \ + -policy=psm-vault-policy \ + -period=24h \ + -display-name=psm-vault-token \ + -field=token > /tokens/psm-token + + echo "Minting nsm token..." + vault token create \ + -policy=nsm-vault-policy \ + -period=24h \ + -display-name=nsm-vault-token \ + -field=token > /tokens/nsm-token + + touch /tokens/done + echo "Tokens minted" + ## --------------------------------------------------------------- + ## Write tokens into k8s Secrets in the flow namespace. + ## --------------------------------------------------------------- + - name: write-secrets + image: {{ .Values.vault.configJob.kubectlImage | quote }} + volumeMounts: + - name: tokens + mountPath: /tokens + command: + - /bin/sh + - -c + - | + set -e + echo "Waiting for mint container..." + until [ -f /tokens/done ]; do sleep 2; done + + # Ensure flow namespace exists and is labeled for ESO sync. + kubectl create namespace {{ .Values.flow.namespace }} --dry-run=client -o yaml \ + | kubectl apply -f - + kubectl label namespace {{ .Values.flow.namespace }} \ + nico.nvidia.com/managed=true --overwrite + + PSM_TOKEN=$(cat /tokens/psm-token) + NSM_TOKEN=$(cat /tokens/nsm-token) + + for pair in "psm-vault-token:$PSM_TOKEN" "nsm-vault-token:$NSM_TOKEN"; do + name=${pair%%:*} + value=${pair#*:} + kubectl create secret generic "$name" \ + -n {{ .Values.flow.namespace }} \ + --from-literal=token="$value" \ + --dry-run=client -o yaml | kubectl apply -f - + done + echo "psm-vault-token and nsm-vault-token written to {{ .Values.flow.namespace }} namespace" +{{- end }} diff --git a/helm-prereqs/templates/postgresql.yaml b/helm-prereqs/templates/postgresql.yaml index d8e2a39611..31dafdee08 100644 --- a/helm-prereqs/templates/postgresql.yaml +++ b/helm-prereqs/templates/postgresql.yaml @@ -45,8 +45,24 @@ spec: nico-system.nico: - superuser - createdb + {{- if .Values.flow.enabled }} + flow.nico: + - superuser + - createdb + psm.nico: + - superuser + - createdb + nsm.nico: + - superuser + - createdb + {{- end 
}} databases: nico_system_nico: nico-system.nico + {{- if .Values.flow.enabled }} + flow: flow.nico + psm: psm.nico + nsm: nsm.nico + {{- end }} patroni: synchronous_mode: true synchronous_mode_strict: true diff --git a/helm-prereqs/values.yaml b/helm-prereqs/values.yaml index cbbd9ef3cd..8e45d3c4d6 100644 --- a/helm-prereqs/values.yaml +++ b/helm-prereqs/values.yaml @@ -53,8 +53,10 @@ vault: organization: "" ## Kubernetes auth role for nico-api to read and manage Vault KV ## credentials used by SiteExplorer and related credential flows. + ## Required by the carbide-api binary — without this role, SiteExplorer + ## cannot authenticate to Vault and BMC credential lookups fail every cycle. nicoApiK8sAuth: - enabled: false + enabled: true serviceAccountName: "nico-api" tokenTTL: "1h" kvMount: "secrets" @@ -100,6 +102,29 @@ imagePullSecrets: ngcCarbidePull: "" # nvcr.io/0837451325059433/carbide-dev ngcNvidianKey: "" # nvcr.io/nvidian/nvforge (optional, for unbound) +## --------------------------------------------------------------------------- +## flow — NICo Flow (rack lifecycle orchestrator, formerly RLA). +## +## This toggle controls only the helm-prereqs side of Flow: +## - per-component databases (flow / psm / nsm) on nico-pg-cluster +## - ClusterExternalSecrets that sync those DB credentials into the flow +## namespace (eso-external-secrets.yaml) +## - psm/nsm Vault tokens (flow-vault-tokens-job post-install hook) +## +## The flow Deployment itself ships as a separate helm release from +## helm/charts/nico-flow, installed by setup.sh phase 7i. That phase is NOT +## driven by this value — it is unconditionally invoked unless setup.sh is +## passed --skip-flow. If you set flow.enabled=false here you should also +## pass --skip-flow on the setup.sh command line, otherwise phase 7i will +## try to install a chart whose prerequisites (DB creds, Vault tokens) are +## missing and fail. 
+## --------------------------------------------------------------------------- +flow: + enabled: true + ## Namespace flow runs in — must match the helm install -n flag for the + ## flow release. ESO and the vault-config-job target this namespace. + namespace: flow + ## --------------------------------------------------------------------------- ## PostgreSQL — Zalando postgres-operator HA cluster (3-node Patroni) ## Credentials are generated by the operator and distributed via ESO. diff --git a/helm/Chart.yaml b/helm/Chart.yaml index a36f8cde66..7b141e13dc 100644 --- a/helm/Chart.yaml +++ b/helm/Chart.yaml @@ -26,6 +26,9 @@ dependencies: - name: nico-dsx-exchange-consumer version: "0.1.0" condition: nico-dsx-exchange-consumer.enabled + - name: nico-flow + version: "0.1.0" + condition: nico-flow.enabled - name: nico-hardware-health version: "0.1.0" condition: nico-hardware-health.enabled diff --git a/helm/charts/nico-flow/Chart.yaml b/helm/charts/nico-flow/Chart.yaml new file mode 100644 index 0000000000..fea8cf2de5 --- /dev/null +++ b/helm/charts/nico-flow/Chart.yaml @@ -0,0 +1,13 @@ +apiVersion: v2 +name: nico-flow +description: Helm chart for the NICo Flow component (formerly RLA) — rack lifecycle orchestrator with PSM/NSM sidecars +type: application +version: 0.1.0 +appVersion: "latest" +keywords: + - nico + - flow + - rla + - rack + - psm + - nsm diff --git a/helm/charts/nico-flow/templates/_helpers.tpl b/helm/charts/nico-flow/templates/_helpers.tpl new file mode 100644 index 0000000000..6ed1a75c75 --- /dev/null +++ b/helm/charts/nico-flow/templates/_helpers.tpl @@ -0,0 +1,90 @@ +{{/* +Resolve the namespace flow runs in. +*/}} +{{- define "nico-flow.namespace" -}} +{{- default .Release.Namespace .Values.namespaceOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Chart name + version label. 
+*/}} +{{- define "nico-flow.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end -}} + +{{/* +Common labels for every flow object. +*/}} +{{- define "nico-flow.labels" -}} +helm.sh/chart: {{ include "nico-flow.chart" . }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +app.kubernetes.io/part-of: site-controller +app.kubernetes.io/name: flow +app.kubernetes.io/component: orchestrator +{{- end -}} + +{{/* +Pod selector labels — must match the pod template labels in deployment.yaml. +The three sidecar Services all select on `app: flow` because they target the +same pod. +*/}} +{{- define "nico-flow.selectorLabels" -}} +app: flow +app.kubernetes.io/name: flow +{{- end -}} + +{{/* +Image references — one per container. If images[component].repository is empty, fall +back to global.image.repository + "/nico-" + component. Same for tag (global.image.tag). +Usage: {{ include "nico-flow.image" (dict "component" "flow" "Values" .Values) }} +*/}} +{{- define "nico-flow.image" -}} +{{- $component := .component -}} +{{- $values := .Values -}} +{{- $override := index $values.images $component -}} +{{- $repo := $override.repository -}} +{{- if not $repo -}} +{{- $repo = printf "%s/nico-%s" $values.global.image.repository $component -}} +{{- end -}} +{{- $tag := $override.tag -}} +{{- if not $tag -}} +{{- $tag = $values.global.image.tag -}} +{{- end -}} +{{- printf "%s:%s" $repo $tag -}} +{{- end -}} + +{{/* +SPIFFE Certificate spec for flow (covers flow, psm, nsm Service DNS names). +*/}} +{{- define "nico-flow.certificateSpec" -}} +duration: {{ .global.certificate.duration }} +renewBefore: {{ .global.certificate.renewBefore }} +commonName: {{ printf "%s.%s.svc.cluster.local" .cert.serviceName .namespace }} +dnsNames: + - {{ printf "%s.%s.svc.cluster.local" .cert.serviceName .namespace }} + - {{ printf "%s.%s" .cert.serviceName .namespace }} +{{- range .cert.extraDnsNames | default list }} + - {{ printf "%s.%s.svc.cluster.local" . $.namespace }} + - {{ printf "%s.%s" . 
$.namespace }} +{{- end }} +uris: + ## Exactly one SPIFFE URI by design — carbide-core's authn middleware + ## (crates/authn/src/lib.rs) rejects certificates whose SAN extension carries + ## more than one URI. The single URI must satisfy nico-api end-to-end: + ## 1. trust domain is one of nico-api's spiffe_trust_domain(s) + ## 2. <apiIdentity.namespace>/sa/ matches nico-api's spiffe_service_base_paths + ## (decoupled from the Kubernetes namespace Flow runs in — flow's K8s + ## namespace `flow` is not in the allow-list; `nico-system` is) + ## 3. <apiIdentity.serviceName> matches an InternalRBACRules principal + ## (the upstream rename PR teaches nico-api to accept both `nico-flow` + ## and `carbide-flow`, so either value works during the transition) + - {{ printf "spiffe://%s/%s/sa/%s" .global.spiffe.trustDomain .cert.apiIdentity.namespace .cert.apiIdentity.serviceName }} +privateKey: + algorithm: {{ .global.certificate.privateKey.algorithm }} + size: {{ .global.certificate.privateKey.size }} +issuerRef: + kind: {{ .global.certificate.issuerRef.kind }} + name: {{ .global.certificate.issuerRef.name }} + group: {{ .global.certificate.issuerRef.group }} +secretName: {{ .name }} +{{- end -}} diff --git a/helm/charts/nico-flow/templates/certificate.yaml b/helm/charts/nico-flow/templates/certificate.yaml new file mode 100644 index 0000000000..13b0e96c04 --- /dev/null +++ b/helm/charts/nico-flow/templates/certificate.yaml @@ -0,0 +1,44 @@ +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: {{ .Values.certificate.name }} + namespace: {{ include "nico-flow.namespace" . }} + labels: + {{- include "nico-flow.labels" . | nindent 4 }} +spec: + {{- include "nico-flow.certificateSpec" (dict + "name" .Values.certificate.name + "cert" .Values.certificate + "global" .Values.global + "namespace" (include "nico-flow.namespace" .)
+ ) | nindent 2 }} +{{- if .Values.temporalCert.enabled }} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: {{ .Values.temporalCert.name }} + namespace: {{ include "nico-flow.namespace" . }} + labels: + {{- include "nico-flow.labels" . | nindent 4 }} +spec: + secretName: {{ .Values.temporalCert.secretName }} + duration: {{ .Values.temporalCert.duration }} + renewBefore: {{ .Values.temporalCert.renewBefore }} + commonName: {{ .Values.temporalCert.commonName }} + isCA: false + subject: + organizations: + - NVIDIA + privateKey: + algorithm: {{ .Values.temporalCert.privateKey.algorithm }} + size: {{ .Values.temporalCert.privateKey.size }} + usages: + - client auth + dnsNames: + {{- toYaml .Values.temporalCert.dnsNames | nindent 4 }} + issuerRef: + kind: {{ .Values.temporalCert.issuerRef.kind }} + name: {{ .Values.temporalCert.issuerRef.name }} + group: {{ .Values.temporalCert.issuerRef.group }} +{{- end }} diff --git a/helm/charts/nico-flow/templates/configmap.yaml b/helm/charts/nico-flow/templates/configmap.yaml new file mode 100644 index 0000000000..168b4daa2d --- /dev/null +++ b/helm/charts/nico-flow/templates/configmap.yaml @@ -0,0 +1,12 @@ +{{- if .Values.flowConfig }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: flow-config-files + namespace: {{ include "nico-flow.namespace" . }} + labels: + {{- include "nico-flow.labels" . | nindent 4 }} +data: + flowconfig.yaml: | +{{ .Values.flowConfig | indent 4 }} +{{- end }} diff --git a/helm/charts/nico-flow/templates/deployment.yaml b/helm/charts/nico-flow/templates/deployment.yaml new file mode 100644 index 0000000000..d69a5c17e4 --- /dev/null +++ b/helm/charts/nico-flow/templates/deployment.yaml @@ -0,0 +1,216 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: flow + namespace: {{ include "nico-flow.namespace" . }} + labels: + {{- include "nico-flow.labels" . 
| nindent 4 }} +spec: + replicas: {{ .Values.replicas }} + strategy: + type: RollingUpdate + selector: + matchLabels: + {{- include "nico-flow.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + ## Merged into one map: both helpers define app.kubernetes.io/name, and a duplicate mapping key is invalid YAML. + {{- merge (include "nico-flow.selectorLabels" . | fromYaml) (include "nico-flow.labels" . | fromYaml) | toYaml | nindent 8 }} + annotations: + kubectl.kubernetes.io/default-container: flow + spec: + automountServiceAccountToken: {{ .Values.automountServiceAccountToken }} + serviceAccountName: {{ .Values.serviceAccountName }} + {{- with .Values.global.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + ## ------------------------------------------------------------- + ## flow — main orchestrator container (gRPC 50051) + ## ------------------------------------------------------------- + - name: flow + image: {{ include "nico-flow.image" (dict "component" "flow" "Values" .Values) | quote }} + imagePullPolicy: {{ .Values.global.image.pullPolicy }} + env: + ## flow's GetDeploymentEnv (carbide-rest/flow/internal/service/config.go) + ## requires FLOW_ENV ∈ {development, staging, production}; empty/unset + ## causes the pod to fail startup validation with + ## "FLOW_ENV is required". No implicit default in code, so set one here.
+ - name: FLOW_ENV + value: {{ .Values.flowEnv | quote }} + - name: DB_NAME + value: {{ .Values.database.flow.name | quote }} + - name: DB_USER + valueFrom: + secretKeyRef: + name: {{ .Values.database.flow.credentialsSecret }} + key: username + - name: DB_PASSWORD + valueFrom: + secretKeyRef: + name: {{ .Values.database.flow.credentialsSecret }} + key: password + - name: DB_HOST + value: {{ .Values.database.host | quote }} + - name: DB_PORT + value: {{ .Values.database.port | quote }} + - name: DB_CERT_PATH + value: {{ .Values.database.certPath | quote }} + - name: NICO_CORE_API_URL + value: {{ .Values.nicoCoreApi.url | quote }} + - name: TEMPORAL_NAMESPACE + value: {{ .Values.temporal.namespace | quote }} + - name: TEMPORAL_HOST + value: {{ .Values.temporal.host | quote }} + - name: TEMPORAL_PORT + value: {{ .Values.temporal.port | quote }} + - name: TEMPORAL_CERT_PATH + value: {{ .Values.temporal.certPath | quote }} + - name: TEMPORAL_ENABLE_TLS + value: {{ .Values.temporal.enableTls | quote }} + - name: TEMPORAL_SERVER_NAME + value: {{ .Values.temporal.serverName | quote }} + - name: PSM_API_URL + value: {{ printf "psm.%s.svc.cluster.local:%d" (include "nico-flow.namespace" .) (int .Values.ports.psm) | quote }} + - name: NSM_API_URL + value: {{ printf "nsm.%s.svc.cluster.local:%d" (include "nico-flow.namespace" .) (int .Values.ports.nsm) | quote }} + {{- with .Values.extraEnv.flow }} + {{- toYaml . 
| nindent 12 }} + {{- end }} + ports: + - name: grpc + containerPort: {{ .Values.ports.flow }} + resources: + {{- toYaml .Values.resources.flow | nindent 12 }} + volumeMounts: + - name: spiffe + mountPath: /var/run/secrets/spiffe.io + readOnly: true + {{- if .Values.flowConfig }} + - name: flow-config-files + mountPath: /etc/flow + readOnly: true + {{- end }} + - name: temporal-client-certs + mountPath: {{ .Values.temporal.certPath }} + readOnly: true + ## ------------------------------------------------------------- + ## psm — power-shelf manager (gRPC 50052) + ## ------------------------------------------------------------- + - name: psm + image: {{ include "nico-flow.image" (dict "component" "psm" "Values" .Values) | quote }} + imagePullPolicy: {{ .Values.global.image.pullPolicy }} + env: + - name: PSM_PORT + value: {{ .Values.ports.psm | quote }} + - name: FW_DIR + value: {{ .Values.firmware.psm.mountPath | quote }} + - name: DB_NAME + value: {{ .Values.database.psm.name | quote }} + - name: DB_USER + valueFrom: + secretKeyRef: + name: {{ .Values.database.psm.credentialsSecret }} + key: username + - name: DB_PASSWORD + valueFrom: + secretKeyRef: + name: {{ .Values.database.psm.credentialsSecret }} + key: password + - name: DB_HOST + value: {{ .Values.database.host | quote }} + - name: DB_PORT + value: {{ .Values.database.port | quote }} + - name: DB_CERT_PATH + value: {{ .Values.database.certPath | quote }} + - name: VAULT_TOKEN + valueFrom: + secretKeyRef: + name: {{ .Values.vault.psmTokenSecret }} + key: token + - name: VAULT_ADDR + value: {{ .Values.vault.address | quote }} + {{- with .Values.extraEnv.psm }} + {{- toYaml . 
| nindent 12 }} + {{- end }} + ports: + - name: grpc-psm + containerPort: {{ .Values.ports.psm }} + resources: + {{- toYaml .Values.resources.psm | nindent 12 }} + volumeMounts: + - name: spiffe + mountPath: /var/run/secrets/spiffe.io + readOnly: true + - name: psm-firmware + mountPath: {{ .Values.firmware.psm.mountPath }} + ## ------------------------------------------------------------- + ## nsm — NVLink switch manager (gRPC 50053) + ## ------------------------------------------------------------- + - name: nsm + image: {{ include "nico-flow.image" (dict "component" "nsm" "Values" .Values) | quote }} + imagePullPolicy: {{ .Values.global.image.pullPolicy }} + env: + - name: NSM_PORT + value: {{ .Values.ports.nsm | quote }} + - name: DB_DATABASE + value: {{ .Values.database.nsm.name | quote }} + - name: DB_USER + valueFrom: + secretKeyRef: + name: {{ .Values.database.nsm.credentialsSecret }} + key: username + - name: DB_PASSWORD + valueFrom: + secretKeyRef: + name: {{ .Values.database.nsm.credentialsSecret }} + key: password + - name: DB_ADDR + value: {{ .Values.database.host | quote }} + - name: DB_PORT + value: {{ .Values.database.port | quote }} + - name: DB_CERT_PATH + value: {{ .Values.database.certPath | quote }} + - name: VAULT_TOKEN + valueFrom: + secretKeyRef: + name: {{ .Values.vault.nsmTokenSecret }} + key: token + - name: VAULT_ADDR + value: {{ .Values.vault.address | quote }} + - name: FW_FIRMWARE_DIR + value: {{ printf "%s/firmware" .Values.firmware.nsm.mountPath | quote }} + - name: FW_BUNDLES_DIR + value: {{ printf "%s/bundles" .Values.firmware.nsm.mountPath | quote }} + {{- with .Values.extraEnv.nsm }} + {{- toYaml . 
| nindent 12 }} + {{- end }} + ports: + - name: grpc-nsm + containerPort: {{ .Values.ports.nsm }} + resources: + {{- toYaml .Values.resources.nsm | nindent 12 }} + volumeMounts: + - name: spiffe + mountPath: /var/run/secrets/spiffe.io + readOnly: true + - name: nsm-scratch + mountPath: {{ .Values.firmware.nsm.mountPath }} + volumes: + - name: spiffe + secret: + secretName: {{ .Values.certificate.name }} + {{- if .Values.flowConfig }} + - name: flow-config-files + configMap: + name: flow-config-files + {{- end }} + - name: temporal-client-certs + secret: + secretName: {{ .Values.temporalCert.secretName }} + - name: psm-firmware + {{- toYaml .Values.firmware.psm.volume | nindent 10 }} + - name: nsm-scratch + {{- toYaml .Values.firmware.nsm.volume | nindent 10 }} diff --git a/helm/charts/nico-flow/templates/image-pull-secret.yaml b/helm/charts/nico-flow/templates/image-pull-secret.yaml new file mode 100644 index 0000000000..08c4e64575 --- /dev/null +++ b/helm/charts/nico-flow/templates/image-pull-secret.yaml @@ -0,0 +1,28 @@ +{{- if .Values.imagePullSecret.create }} +## Docker registry pull secret for the flow namespace. +## +## Rendered as a pre-install hook (weight -10) so it lands BEFORE the +## Deployment starts pulling. Without this, the flow pod hits 401 on the +## first ImagePull attempt and ImagePullBackOffs until helm gives up. +## Same ordering pattern as nico-rest-common/templates/secrets.yaml. +## +## The actual credential payload is rendered into +## .Values.imagePullSecret.dockerconfigjson by setup.sh phase 7i from +## REGISTRY_PULL_SECRET. The chart default is the empty-auth placeholder +## ({"auths":{}}) so installs against pre-loaded / air-gapped clusters +## remain valid without overrides. +apiVersion: v1 +kind: Secret +metadata: + name: image-pull-secret + namespace: {{ include "nico-flow.namespace" . }} + labels: + {{- include "nico-flow.labels" . 
| nindent 4 }} + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "-10" + "helm.sh/hook-delete-policy": before-hook-creation +type: kubernetes.io/dockerconfigjson +data: + .dockerconfigjson: {{ .Values.imagePullSecret.dockerconfigjson | quote }} +{{- end }} diff --git a/helm/charts/nico-flow/templates/namespace.yaml b/helm/charts/nico-flow/templates/namespace.yaml new file mode 100644 index 0000000000..85cb1b4777 --- /dev/null +++ b/helm/charts/nico-flow/templates/namespace.yaml @@ -0,0 +1,13 @@ +{{- if ne (toString .Values.createNamespace) "false" }} +## Namespace for flow; rendered unless createNamespace is explicitly false (unset means create). Labeled +## nico.nvidia.com/managed=true so the nico-roots ClusterExternalSecret (in helm-prereqs) syncs the site CA in. +apiVersion: v1 +kind: Namespace +metadata: + name: {{ include "nico-flow.namespace" . }} + annotations: + helm.sh/resource-policy: keep + labels: + {{- include "nico-flow.labels" . | nindent 4 }} + nico.nvidia.com/managed: "true" +{{- end }} diff --git a/helm/charts/nico-flow/templates/rbac.yaml b/helm/charts/nico-flow/templates/rbac.yaml new file mode 100644 index 0000000000..cbeb21c37d --- /dev/null +++ b/helm/charts/nico-flow/templates/rbac.yaml @@ -0,0 +1,27 @@ +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: flow + namespace: {{ include "nico-flow.namespace" . }} + labels: + {{- include "nico-flow.labels" . | nindent 4 }} +rules: + - apiGroups: ["cert-manager.io"] + resources: ["certificaterequests"] + verbs: ["create"] +--- +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: flow + namespace: {{ include "nico-flow.namespace" . }} + labels: + {{- include "nico-flow.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: flow +subjects: + - kind: ServiceAccount + name: {{ .Values.serviceAccountName }} + namespace: {{ include "nico-flow.namespace" .
}} diff --git a/helm/charts/nico-flow/templates/service-account.yaml b/helm/charts/nico-flow/templates/service-account.yaml new file mode 100644 index 0000000000..f3d64b0952 --- /dev/null +++ b/helm/charts/nico-flow/templates/service-account.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ .Values.serviceAccountName }} + namespace: {{ include "nico-flow.namespace" . }} + labels: + {{- include "nico-flow.labels" . | nindent 4 }} +automountServiceAccountToken: {{ .Values.automountServiceAccountToken }} diff --git a/helm/charts/nico-flow/templates/service-grpc.yaml b/helm/charts/nico-flow/templates/service-grpc.yaml new file mode 100644 index 0000000000..bbf345571e --- /dev/null +++ b/helm/charts/nico-flow/templates/service-grpc.yaml @@ -0,0 +1,55 @@ +## Headless gRPC services for the three containers in the flow pod. +## All three select on the same pod; targetPort routes to the per-container port. +apiVersion: v1 +kind: Service +metadata: + name: flow + namespace: {{ include "nico-flow.namespace" . }} + labels: + {{- include "nico-flow.labels" . | nindent 4 }} + app: flow +spec: + clusterIP: None + selector: + {{- include "nico-flow.selectorLabels" . | nindent 4 }} + ports: + - port: {{ .Values.ports.flow }} + targetPort: {{ .Values.ports.flow }} + name: grpc + protocol: TCP +--- +apiVersion: v1 +kind: Service +metadata: + name: psm + namespace: {{ include "nico-flow.namespace" . }} + labels: + {{- include "nico-flow.labels" . | nindent 4 }} + app: psm +spec: + clusterIP: None + selector: + {{- include "nico-flow.selectorLabels" . | nindent 4 }} + ports: + - port: {{ .Values.ports.psm }} + targetPort: {{ .Values.ports.psm }} + name: grpc + protocol: TCP +--- +apiVersion: v1 +kind: Service +metadata: + name: nsm + namespace: {{ include "nico-flow.namespace" . }} + labels: + {{- include "nico-flow.labels" . | nindent 4 }} + app: nsm +spec: + clusterIP: None + selector: + {{- include "nico-flow.selectorLabels" . 
| nindent 4 }} + ports: + - port: {{ .Values.ports.nsm }} + targetPort: {{ .Values.ports.nsm }} + name: grpc + protocol: TCP diff --git a/helm/charts/nico-flow/values.yaml b/helm/charts/nico-flow/values.yaml new file mode 100644 index 0000000000..3cb16696ed --- /dev/null +++ b/helm/charts/nico-flow/values.yaml @@ -0,0 +1,203 @@ +## ============================================================================= +## flow — rack lifecycle orchestrator (formerly RLA) +## ============================================================================= +## Single pod with three gRPC containers: flow (50051), psm (50052), nsm (50053). +## Each container has its own Postgres database (flow, psm, nsm) on the shared +## nico-pg-cluster. PSM and NSM also need a Vault token (provisioned by the +## helm-prereqs vault-config-job). +## +## Deploys into a dedicated `flow` namespace. The namespace must carry the +## label nico.nvidia.com/managed=true so ESO syncs nico-roots in. +## ============================================================================= + +## Global settings (overridden by umbrella chart) +global: + image: + repository: "" + tag: "" + pullPolicy: IfNotPresent + imagePullSecrets: [] + certificate: + duration: 720h0m0s + renewBefore: 360h0m0s + privateKey: + algorithm: ECDSA + size: 384 + issuerRef: + kind: ClusterIssuer + name: vault-nico-issuer + group: cert-manager.io + spiffe: + trustDomain: nico.local + labels: + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: site-controller + +## Override the release namespace (defaults to the namespace passed via helm install -n <namespace>) +namespaceOverride: "" + +replicas: 1 +serviceAccountName: flow +automountServiceAccountToken: true + +## flow's GetDeploymentEnv requires FLOW_ENV to be set to one of +## "development", "staging", or "production". There is no implicit default in +## carbide-rest/flow/internal/service/config.go — an unset value causes the +## flow container to exit on startup validation. Override per environment.
+flowEnv: production + +## Per-component image overrides. +## Repositories default to <global.image.repository>/nico-flow etc, but each can +## be overridden independently if you want to pin one container to a different tag. +images: + flow: + repository: "" # defaults to <global.image.repository>/nico-flow + tag: "" # defaults to global.image.tag + psm: + repository: "" # defaults to <global.image.repository>/nico-psm + tag: "" + nsm: + repository: "" # defaults to <global.image.repository>/nico-nsm + tag: "" + +## Ports the three containers listen on (must stay distinct — same pod). +ports: + flow: 50051 + psm: 50052 + nsm: 50053 + +resources: + flow: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 1 + memory: 1Gi + psm: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + nsm: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + +## SPIFFE certificate covering all three Service DNS names. +## +## Two identities live in this struct: +## - serviceName + extraDnsNames drive the *intra-cluster* identity: Service +## DNS, the cert commonName, and additional DNS SANs. These are what +## in-cluster peers dial. +## - apiIdentity.{namespace,serviceName} drive the *SPIFFE URI* baked into +## the cert SAN. nico-api's auth middleware validates client certs against +## its own spiffe_service_base_paths and InternalRBACRules — and rejects +## any cert SAN that carries more than one URI (crates/authn/src/lib.rs +## enforces "exactly 1 URI"). So this URI must be correct on its own; no +## extraUris escape hatch. +certificate: + name: flow-certificate + serviceName: flow + ## Additional DNS names — psm + nsm services live in the same namespace. + extraDnsNames: + - psm + - nsm + ## SPIFFE URI components — emitted as the single URI SAN on the cert. + ## Defaults target nico-api with the upstream RBAC rename (PR #1907) in + ## place: serviceName `nico-flow` is accepted by the matcher alongside the + ## legacy `carbide-flow` alias. Override if your deployment uses different + ## base paths or principal names.
+ apiIdentity: + namespace: nico-system + serviceName: nico-flow + +## Temporal mTLS client certificate. +## Issued by the nico-rest-ca-issuer ClusterIssuer (installed by nico-rest setup). +temporalCert: + enabled: true + name: temporal-client-certs + secretName: temporal-client-certs + duration: 2160h0m0s + renewBefore: 360h0m0s + commonName: temporal-client + dnsNames: + - temporal-client + - flow + privateKey: + algorithm: RSA + size: 2048 + issuerRef: + kind: ClusterIssuer + name: nico-rest-ca-issuer + group: cert-manager.io + +## Database connection settings (shared host; per-container DB and secret). +database: + host: nico-pg-cluster.postgres.svc.cluster.local + port: 5432 + ## CA cert that the containers use for the postgres TLS connection. + ## Mounted from the SPIFFE secret (same ca.crt the operator signs with). + certPath: /var/run/secrets/spiffe.io/ca.crt + flow: + name: flow + credentialsSecret: flow.nico.nico-pg-cluster.credentials + psm: + name: psm + credentialsSecret: psm.nico.nico-pg-cluster.credentials + nsm: + name: nsm + credentialsSecret: nsm.nico.nico-pg-cluster.credentials + +## NICo Core API endpoint (gRPC) that flow calls. +nicoCoreApi: + url: nico-api.nico-system.svc.cluster.local:1079 + +## Temporal endpoint and namespace. +temporal: + host: temporal-frontend-headless.temporal.svc.cluster.local + port: 7233 + namespace: flow + enableTls: true + serverName: interservice.server.temporal.local + certPath: /var/run/secrets/temporal-client-certs + +## Vault for PSM/NSM dynamic secrets. +vault: + address: https://vault.vault.svc.cluster.local:8200 + psmTokenSecret: psm-vault-token + nsmTokenSecret: nsm-vault-token + +## Firmware staging volumes (emptyDir by default; override with hostPath for sites +## that need a persistent firmware repo). 
+firmware: + psm: + volume: + emptyDir: {} + mountPath: /var/lib/psm/firmware + nsm: + volume: + emptyDir: {} + mountPath: /var/lib/nsm + +## Optional flowconfig.yaml — when set, becomes the flow-config-files ConfigMap. +flowConfig: "" + +extraEnv: + flow: [] + psm: [] + nsm: [] + +## Docker registry pull secret created by the chart in the flow namespace. +## setup.sh phase 7i renders dockerconfigjson from REGISTRY_PULL_SECRET and +## passes it via --set. The default placeholder ({"auths":{}}) lets the +## chart install on air-gapped / pre-loaded clusters without overrides. +## Set create=false to skip if the secret is provisioned out of band. +imagePullSecret: + create: true + dockerconfigjson: "eyJhdXRocyI6e319" diff --git a/helm/values.yaml b/helm/values.yaml index 4d16293fb7..25db9452ac 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -88,6 +88,18 @@ nico-dns: nico-dsx-exchange-consumer: enabled: true +## --------------------------------------------------------------------------- +## nico-flow — Flow (formerly RLA) rack lifecycle orchestrator +## Flow ships as a STANDALONE Helm release in its own namespace; it is not +## installed by `helm install nico ./helm`. The dependency is declared so the +## chart lives at helm/charts/nico-flow (Helm convention), but the condition +## keeps the umbrella from auto-rendering it. Deploy with: +## helm install flow ./helm/charts/nico-flow -n flow ... +## (helm-prereqs/setup.sh phase 7i does this for you.) +## --------------------------------------------------------------------------- +nico-flow: + enabled: false + ## --------------------------------------------------------------------------- ## nico-hardware-health — Hardware health collector ## Collects and reports hardware health metrics from managed machines.