Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 14 additions & 8 deletions helm-prereqs/clean.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,12 @@ cd "${SCRIPT_DIR}"
# cert-manager and ClusterIssuers.
# ---------------------------------------------------------------------------
echo "=== [0/8] Uninstalling NICo REST stack ==="
helm uninstall nico-rest-site-agent -n nico-rest 2>/dev/null || true
helm uninstall nico-rest -n nico-rest 2>/dev/null || true
# Flow goes first — it talks to Temporal + nico-api and depends on credentials
# from both nico-prereqs (DB creds, vault tokens) and the REST stack.
helm uninstall flow -n flow 2>/dev/null || true
kubectl delete ns flow --wait=false --ignore-not-found 2>/dev/null || true
helm uninstall nico-rest-site-agent -n nico-rest 2>/dev/null || true
helm uninstall nico-rest -n nico-rest 2>/dev/null || true
helm uninstall temporal -n temporal 2>/dev/null || true

if kubectl get deploy keycloak -n nico-rest &>/dev/null; then
Expand All @@ -50,10 +54,10 @@ else
fi

kubectl delete clusterissuer nico-rest-ca-issuer --ignore-not-found 2>/dev/null || true
kubectl delete ns nico-rest temporal \
kubectl delete ns nico-rest temporal flow \
--wait=false --ignore-not-found 2>/dev/null || true
echo "Waiting for nico-rest and temporal namespaces to terminate..."
kubectl wait --for=delete ns/nico-rest ns/temporal \
echo "Waiting for nico-rest, temporal, and flow namespaces to terminate..."
kubectl wait --for=delete ns/nico-rest ns/temporal ns/flow \
--timeout=120s 2>/dev/null || true

# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -153,13 +157,15 @@ kubectl delete clusterissuer \
kubectl delete clustersecretstore \
cert-manager-ns-secretstore postgres-ns-secretstore \
--ignore-not-found 2>/dev/null || true
kubectl delete clusterexternalsecret nico-roots-eso nico-db-eso \
kubectl delete clusterexternalsecret \
nico-roots-eso nico-db-eso \
flow-db-eso psm-db-eso nsm-db-eso \
--ignore-not-found 2>/dev/null || true
kubectl delete clusterrole \
vault-pki-config-reader eso-postgres-ns-role \
vault-pki-config-reader eso-postgres-ns-role flow-vault-tokens-writer \
--ignore-not-found 2>/dev/null || true
kubectl delete clusterrolebinding \
vault-pki-config-reader eso-postgres-ns-rolebinding \
vault-pki-config-reader eso-postgres-ns-rolebinding flow-vault-tokens-writer \
--ignore-not-found 2>/dev/null || true

# ---------------------------------------------------------------------------
Expand Down
18 changes: 18 additions & 0 deletions helm-prereqs/health-check.sh
Original file line number Diff line number Diff line change
Expand Up @@ -334,8 +334,26 @@ for _OPT_DEP in nico-hardware-health nico-ssh-console-rs nico-dsx-exchange-consu
fi
done

# NICo Flow health section.
# Runs only when the Flow namespace (FLOW_NS, default "flow") exists; otherwise
# the section is reported as skipped. When present, checks the flow deployment
# and then each Secret listed below in that namespace.
section "NICo Flow"
: "${FLOW_NS:=flow}"
if ! kc get ns "${FLOW_NS}" &>/dev/null; then
  skip "flow namespace not present — flow disabled or not yet deployed"
else
  _check_deployment "${FLOW_NS}" flow
  # Secrets checked in the Flow namespace: Vault tokens, DB credentials, certs.
  _FLOW_SECRETS=(
    psm-vault-token
    nsm-vault-token
    flow.nico.nico-pg-cluster.credentials
    psm.nico.nico-pg-cluster.credentials
    nsm.nico.nico-pg-cluster.credentials
    flow-certificate
    temporal-client-certs
    nico-roots
  )
  for _S in "${_FLOW_SECRETS[@]}"; do
    _check_secret_exists "${FLOW_NS}" "${_S}"
  done
fi

section "NICo Jobs"
_check_job_complete "${NICO_NS}" vault-pki-config
if kc get job -n "${NICO_NS}" flow-vault-tokens &>/dev/null; then
_check_job_complete "${NICO_NS}" flow-vault-tokens
fi

# Migration job: find by label (name includes a random suffix)
_MIG_JOB=$(kc get jobs -n "${NICO_NS}" -l 'app.kubernetes.io/name=nico-api-migrate' \
Expand Down
159 changes: 156 additions & 3 deletions helm-prereqs/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@
# ./setup.sh -y # skip all prompts, deploy everything automatically
# ./setup.sh --skip-core # skip Phase 6 NICo Core (print command, deploy manually)
# ./setup.sh --skip-rest # skip Phase 7 NICo REST entirely (no repo needed)
# ./setup.sh --skip-flow # skip Phase 7i NICo Flow (REST still installs)
# # pair with helm-prereqs/values.yaml::flow.enabled=false
# # to skip Flow prereqs (DBs / ESO / vault tokens) too
# ./setup.sh --skip-core --skip-rest # fully non-interactive infra-only run
# ./setup.sh --core-values /path/to/values.yaml # use site-specific values for Phase 6
# ./setup.sh --metallb-config /path/to/metallb.yaml # use site-specific MetalLB config (file or kustomize dir)
Expand All @@ -78,6 +81,7 @@ cd "${SCRIPT_DIR}"
AUTO_YES=false
SKIP_CORE=false
SKIP_REST=false
SKIP_FLOW=false
CORE_VALUES=""
METALLB_CONFIG=""
SITE_OVERLAY=""
Expand All @@ -86,6 +90,7 @@ while [[ $# -gt 0 ]]; do
-y) AUTO_YES=true ;;
--skip-core) SKIP_CORE=true ;;
--skip-rest) SKIP_REST=true ;;
--skip-flow) SKIP_FLOW=true ;;
--debug) set -x ;;
--core-values)
[[ -z "${2:-}" ]] && { echo "Error: --core-values requires a file path"; exit 1; }
Expand All @@ -102,7 +107,7 @@ while [[ $# -gt 0 ]]; do
SITE_OVERLAY="$(cd "$(dirname "$2")" && pwd)/$(basename "$2")"
[[ ! -d "${SITE_OVERLAY}" ]] && { echo "Error: --site-overlay directory not found: $2"; exit 1; }
shift ;;
*) echo "Usage: $0 [-y] [--skip-core] [--skip-rest] [--core-values <file>] [--metallb-config <file-or-dir>] [--site-overlay <dir>] [--debug]"; exit 1 ;;
*) echo "Usage: $0 [-y] [--skip-core] [--skip-rest] [--skip-flow] [--core-values <file>] [--metallb-config <file-or-dir>] [--site-overlay <dir>] [--debug]"; exit 1 ;;
esac
shift
done
Expand All @@ -111,7 +116,7 @@ done
# Pre-flight checks — env vars, tools, config files, NICo REST repo
# Exports NICO_REST_REPO if resolved. Exits 1 if user declines to continue.
# ---------------------------------------------------------------------------
export AUTO_YES SKIP_CORE SKIP_REST
export AUTO_YES SKIP_CORE SKIP_REST SKIP_FLOW
# shellcheck source=preflight.sh
source "${SCRIPT_DIR}/preflight.sh"

Expand Down Expand Up @@ -591,6 +596,9 @@ kubectl exec -n temporal deploy/temporal-admintools -- \
sh -c "temporal operator namespace create -n cloud --address ${_TEMPORAL_ADDR} ${_TEMPORAL_TLS}" 2>/dev/null || true
kubectl exec -n temporal deploy/temporal-admintools -- \
sh -c "temporal operator namespace create -n site --address ${_TEMPORAL_ADDR} ${_TEMPORAL_TLS}" 2>/dev/null || true
# flow Temporal namespace — required by NICo Flow workers; pod panics on startup if absent.
kubectl exec -n temporal deploy/temporal-admintools -- \
sh -c "temporal operator namespace create -n flow --address ${_TEMPORAL_ADDR} ${_TEMPORAL_TLS}" 2>/dev/null || true
echo "Temporal namespaces ready"

_SETUP_PHASE="[7g/7] NICo REST helm chart"
Expand Down Expand Up @@ -721,13 +729,25 @@ kubectl exec -n temporal deploy/temporal-admintools -- \
sh -c "temporal operator namespace create -n '${NICO_SITE_UUID}' --address ${_TEMPORAL_ADDR} ${_TEMPORAL_TLS}" 2>/dev/null || true
echo "Temporal namespace ready"

# Decide whether the site-agent should open its Flow gRPC client.
# The site-agent treats FLOW_GRPC_ENABLED as on only when the lower-cased value
# is "true" (see carbide-rest/site-agent/pkg/components/config/
# config_manager.go); without it, site-agent never opens a connection to the
# Flow pod deployed in phase 7i. It defaults ON when Flow itself is being
# deployed and OFF under --skip-flow; users can flip it back via --set when
# pairing --skip-flow.
# SKIP_FLOW holds the literal word true/false and is executed as a command.
if "${SKIP_FLOW}"; then
  _FLOW_GRPC_ENABLED="false"
else
  _FLOW_GRPC_ENABLED="true"
fi

helm upgrade --install nico-rest-site-agent "${NICO_SITE_AGENT_CHART}" \
"${NICO_SITE_AGENT_ARGS[@]}" \
--set "envConfig.CLUSTER_ID=${NICO_SITE_UUID}" \
--set "envConfig.TEMPORAL_SUBSCRIBE_NAMESPACE=${NICO_SITE_UUID}" \
--set "envConfig.TEMPORAL_SUBSCRIBE_QUEUE=site" \
--set "envConfig.FLOW_GRPC_ENABLED=${_FLOW_GRPC_ENABLED}" \
--timeout 300s --wait
echo "NICo REST site-agent deployed and bootstrap complete"
echo "NICo REST site-agent deployed and bootstrap complete (FLOW_GRPC_ENABLED=${_FLOW_GRPC_ENABLED})"

# Verify the site-agent's gRPC connection to NICo Core succeeded. The site-agent attempts
# the connection exactly once at startup with a 5-second deadline; if it
Expand Down Expand Up @@ -758,6 +778,139 @@ if [ "${_CONNECTED}" = "false" ]; then
echo "Site-agent pod restarted — gRPC connection will be retried"
fi

# --- 7i. NICo Flow ------------------------------------------------------------
# Flow is the rack lifecycle orchestrator (formerly RLA). Single pod with three
# containers — flow (50051), psm (50052), nsm (50053). Runs in its own `flow`
# namespace.
#
# Prerequisites already in place by this point:
# - flow/psm/nsm databases on nico-pg-cluster (helm-prereqs postgresql.yaml)
# - flow.nico/psm.nico/nsm.nico DB credentials synced via ESO into the flow
# namespace by the flow-db-eso / psm-db-eso / nsm-db-eso ClusterExternalSecrets
# - psm-vault-token and nsm-vault-token Secrets in the flow namespace
# (provisioned by the flow-vault-tokens post-install hook)
# - Temporal `flow` namespace (created in phase 7f above)
# - nico-rest-ca-issuer ClusterIssuer (installed by phase 7b — issues the
# temporal-client-certs)
# - vault-nico-issuer ClusterIssuer (issues the SPIFFE cert)
#
# Same pre-apply-cert dance as the site-agent: render the Certificate(s) ahead
# of the helm install so cert-manager has time to issue them and the pod doesn't
# hit a FailedMount race on the spiffe / temporal-client-certs secrets.
# --skip-flow short-circuit. SKIP_FLOW holds the literal word true or false
# (set during argument parsing) and is executed as a command to pick the branch.
if "${SKIP_FLOW}"; then
echo "=== [7i/7] NICo Flow — skipped (--skip-flow) ==="
# Mark the run as finished before leaving.
# NOTE(review): presumably read by an exit/error handler elsewhere — confirm.
_SETUP_PHASE="complete"
# NOTE(review): this exit ends the script here, so nothing that follows the
# Flow phase runs on the --skip-flow path — including the "Setup complete"
# banner echoed after the Flow install. Confirm that skipping the closing
# summary is intended; otherwise wrap the Flow phase in a conditional instead.
exit 0
fi
_SETUP_PHASE="[7i/7] NICo Flow"
echo "=== [7i/7] NICo Flow ==="

NICO_FLOW_CHART="${SCRIPT_DIR}/../helm/charts/nico-flow"
NICO_FLOW_NAMESPACE="flow"

NICO_FLOW_ARGS=(
--namespace "${NICO_FLOW_NAMESPACE}"
--create-namespace
--set "global.image.repository=${NICO_IMAGE_REGISTRY}"
## Flow (nico-flow / nico-psm / nico-nsm) ships on the same image release
## line as NICo REST — they're built and tagged together — so reuse
## NICO_REST_IMAGE_TAG, not NICO_CORE_IMAGE_TAG (which is carbide-api).
--set "global.image.tag=${NICO_REST_IMAGE_TAG}"
)

# Render the dockerconfigjson for the chart-managed image-pull-secret. Same
# pattern as the NICo REST common chart — keep the registry credential on
# the helm command line so the chart template can install it as a
# pre-install hook (pod can't pull from nvcr.io otherwise).
# Escape a string so it can sit inside a JSON double-quoted value.
# Backslashes are doubled first, then double quotes are backslash-escaped;
# doing it in the other order would re-escape the backslashes just added.
# Arguments: $1 - raw string
# Outputs:   escaped string on stdout (no trailing newline)
_flow_json_escape() {
  local _s="$1"
  _s=${_s//\\/\\\\}
  _s=${_s//\"/\\\"}
  printf '%s' "${_s}"
}

# Only build the pull-secret values when a registry credential was supplied.
if [[ -n "${REGISTRY_PULL_SECRET:-}" ]]; then
  # Registry host = everything before the first "/" of the image registry path.
  _flow_registry_server="${NICO_IMAGE_REGISTRY%%/*}"
  # Username and password are JSON-escaped before interpolation: a credential
  # containing a double quote or backslash would otherwise produce an invalid
  # dockerconfigjson document. The result is base64-encoded on a single line.
  _flow_docker_cfg="$(printf '{"auths":{"%s":{"username":"%s","password":"%s"}}}' \
    "${_flow_registry_server}" \
    "$(_flow_json_escape "${REGISTRY_PULL_USERNAME:-\$oauthtoken}")" \
    "$(_flow_json_escape "${REGISTRY_PULL_SECRET}")" | base64 | tr -d '\n')"
  NICO_FLOW_ARGS+=(
    --set "global.imagePullSecrets[0].name=image-pull-secret"
    --set "imagePullSecret.dockerconfigjson=${_flow_docker_cfg}"
  )
fi

# Pre-apply Certificates so cert-manager can issue secrets before the pod schedules.
echo "Pre-applying flow Certificates (SPIFFE + Temporal client)..."
helm template flow "${NICO_FLOW_CHART}" \
"${NICO_FLOW_ARGS[@]}" \
--show-only templates/namespace.yaml | kubectl apply -f -
helm template flow "${NICO_FLOW_CHART}" \
"${NICO_FLOW_ARGS[@]}" \
--show-only templates/certificate.yaml | kubectl apply -f -
# Stamp Helm release metadata onto both pre-applied Certificates: first the
# release-name / release-namespace annotations, then the managed-by=Helm label.
# NOTE(review): presumably needed so the later `helm upgrade --install flow`
# adopts these objects instead of rejecting them — confirm.
for _flow_cert in flow-certificate temporal-client-certs; do
  kubectl annotate "certificate/${_flow_cert}" -n "${NICO_FLOW_NAMESPACE}" \
    "meta.helm.sh/release-name=flow" \
    "meta.helm.sh/release-namespace=${NICO_FLOW_NAMESPACE}" --overwrite
done
for _flow_cert in flow-certificate temporal-client-certs; do
  kubectl label "certificate/${_flow_cert}" -n "${NICO_FLOW_NAMESPACE}" \
    "app.kubernetes.io/managed-by=Helm" --overwrite
done

# Annotate/label the namespace itself — the flow-vault-tokens-job (nico-prereqs
# helm hook) creates this namespace ahead of the flow release. Without Helm
# ownership metadata, helm install refuses to adopt it.
kubectl annotate namespace "${NICO_FLOW_NAMESPACE}" \
"meta.helm.sh/release-name=flow" \
"meta.helm.sh/release-namespace=${NICO_FLOW_NAMESPACE}" --overwrite
kubectl label namespace "${NICO_FLOW_NAMESPACE}" \
"app.kubernetes.io/managed-by=Helm" --overwrite

echo "Waiting for cert-manager to issue flow-certificate..."
kubectl wait --for=condition=Ready certificate/flow-certificate \
-n "${NICO_FLOW_NAMESPACE}" --timeout=120s
echo "Waiting for cert-manager to issue temporal-client-certs..."
kubectl wait --for=condition=Ready certificate/temporal-client-certs \
-n "${NICO_FLOW_NAMESPACE}" --timeout=120s

# Wait for the psm/nsm vault tokens and DB credential ESO syncs to land
# (provisioned by helm-prereqs hooks; may still be in flight if nico-prereqs
# was re-installed just before this phase). Fail-fast if any secret never
# shows up — the alternative (silently falling through to helm install) is
# 5 minutes of FailedMount-loop before helm gives up with an opaque message.
#######################################
# Poll until a Kubernetes Secret exists, or give up after a fixed budget.
# Arguments:
#   $1 - Secret name
#   $2 - namespace to look in
#   $3 - hint printed on timeout (where the Secret comes from / what to check)
#   $4 - optional: number of attempts (default 24)
#   $5 - optional: seconds to sleep after each failed attempt (default 5)
# Outputs:
#   progress lines on stdout; the timeout error and hint on stderr
# Returns:
#   0 as soon as the Secret is found, 1 if it never appears
#######################################
_wait_for_secret() {
  local _name="$1"
  local _ns="$2"
  local _hint="$3"
  local _attempts="${4:-24}"
  local _interval="${5:-5}"
  # Loop counter is local so it cannot clobber a caller's variable.
  local _try
  for ((_try = 1; _try <= _attempts; _try++)); do
    if kubectl get secret "${_name}" -n "${_ns}" >/dev/null 2>&1; then
      echo " ${_name} ready"
      return 0
    fi
    echo " Waiting for ${_name} (${_try}/${_attempts})..."
    sleep "${_interval}"
  done
  # Diagnostics go to stderr; the reported budget is derived from the actual
  # attempts x interval so it stays truthful when the defaults are overridden.
  {
    echo "ERROR: Secret ${_name} did not appear in namespace ${_ns} within $((_attempts * _interval))s."
    echo " ${_hint}"
  } >&2
  return 1
}

echo "Waiting for psm/nsm Vault tokens..."
for _s in psm-vault-token nsm-vault-token; do
_wait_for_secret "${_s}" "${NICO_FLOW_NAMESPACE}" \
"Provisioned by the flow-vault-tokens helm hook in nico-prereqs. Check 'kubectl logs -n nico-system job/flow-vault-tokens' and confirm helm-prereqs/values.yaml::flow.enabled=true."
done
Comment thread
shayan1995 marked this conversation as resolved.

echo "Waiting for flow/psm/nsm DB credentials..."
for _s in flow.nico.nico-pg-cluster.credentials \
psm.nico.nico-pg-cluster.credentials \
nsm.nico.nico-pg-cluster.credentials; do
_wait_for_secret "${_s}" "${NICO_FLOW_NAMESPACE}" \
"Synced by the flow-db-eso/psm-db-eso/nsm-db-eso ClusterExternalSecrets in nico-prereqs. Check 'kubectl describe clusterexternalsecret -A | grep flow' and confirm helm-prereqs/values.yaml::flow.enabled=true."
done
Comment thread
shayan1995 marked this conversation as resolved.

echo "Installing flow helm chart..."
helm upgrade --install flow "${NICO_FLOW_CHART}" \
"${NICO_FLOW_ARGS[@]}" \
--timeout 300s --wait
echo "NICo Flow deployed"

echo ""
echo "========================================================================="
echo " Setup complete"
Expand Down
34 changes: 34 additions & 0 deletions helm-prereqs/templates/eso-external-secrets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,4 +66,38 @@ spec:
key: nico-system.nico.nico-pg-cluster.credentials.postgresql.acid.zalan.do
version: v1
{{- end }}

{{/*
flow/psm/nsm DB credential syncs.
The Zalando operator generates one Secret per (user, cluster) pair in the
postgres namespace; ESO extracts each and projects it into the flow
namespace under the short name the flow chart consumes.
*/}}
{{- if and .Values.postgresql.enabled .Values.flow.enabled }}
{{- range $svc := list "flow" "psm" "nsm" }}
---
apiVersion: external-secrets.io/v1beta1
kind: ClusterExternalSecret
metadata:
name: {{ printf "%s-db-eso" $svc }}
spec:
namespaceSelector:
matchExpressions:
- key: kubernetes.io/metadata.name
operator: In
values: ["{{ $.Values.flow.namespace }}"]
refreshTime: 30s
externalSecretSpec:
secretStoreRef:
name: postgres-ns-secretstore
kind: ClusterSecretStore
target:
name: {{ printf "%s.nico.nico-pg-cluster.credentials" $svc }}
deletionPolicy: Retain
dataFrom:
- extract:
key: {{ printf "%s.nico.nico-pg-cluster.credentials.postgresql.acid.zalan.do" $svc }}
version: v1
{{- end }}
{{- end }}
{{- end }}
Loading
Loading