From b5b3114e71dab72eb341e4276531305bc4ec5c07 Mon Sep 17 00:00:00 2001
From: varashi <frank@boeye.net>
Date: Wed, 27 May 2026 08:45:29 +0200
Subject: [PATCH] =?UTF-8?q?feat(logging):=20migrate=20fluent-bit+adapter?=
 =?UTF-8?q?=20=E2=86=92=20Logging=20Operator=20(Fluentd=20CFAPI)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the previous standalone fluent-bit DaemonSet +
vcflogs-cfapi-adapter sidecar with the Logging Operator pattern.

Architectural reasons:

- The vmware-loginsight CFAPI plugin (1.4.2, bundled in the
  operator's ghcr.io/kube-logging/fluentd:v1.17-5.0-full image)
  replaces our 80-LOC homemade adapter. Maintenance burden moves
  off us.
- Operator-managed Fluent Bit (per-node collector) + Fluentd
  (HA ×2 aggregator) is the canonical k8s logging topology used at
  enterprise scale — direct knowledge transfer for the work-side
  vcflogs project.
- CRD-driven config (Logging / ClusterFlow / ClusterOutput)
  separates infrastructure from routing policy, enabling future
  per-namespace selectivity without ops involvement.

What lands:

- New ns "logging" (replaces "tanzu-system-logging")
- HelmRelease for logging-operator chart 6.5.2 from
  oci://ghcr.io/kube-logging/helm-charts
- Logging CR — Fluent Bit DS + Fluentd STS×2 specs
- ClusterOutput "vcflogs" — vmwareLogInsight HTTPS CFAPI
- ClusterFlow "all-to-vcflogs" — match *, single drop logtag
- New HelmRepository "kube-logging" (oci); removes "fluent" repo

What goes:

- "tanzu-system-logging" ns + HelmRelease + ConfigMap deleted
- "flux-repositories/fluent.yaml" HelmRepository deleted
- ghcr.io/varashi/vcflogs-cfapi-adapter:* image + Varashi/vcflogs-
  cfapi-adapter repo to be deleted manually after Flux reconciles
  successfully (one-way action; image referenced in #151 commit
  message remains discoverable via git history)

Hard-swap cutover: brief log gap (~2-5 min) while Flux deletes the
old DS, installs the operator, and reconciles the Logging CR.

Schema verified against logging-operator 6.5.2 CRDs:
- Logging: spec.fluentbit.inputTail uses native fluent-bit casing
  (Skip_Long_Lines / Mem_Buf_Limit / Refresh_Interval)
- ClusterOutput.spec.vmwareLogInsight: all fields valid
- ClusterFlow.spec.match[].select + filters[].record_modifier: valid

Plex configmaps + READMEs updated to reference the new ns/topology.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 cluster-talos/README.md                       |   2 +-
 .../apps/media/plex-test/app/README.md        |   4 +-
 .../app/configmap-plex-log-tail.yaml          |   8 +-
 .../kubernetes/apps/media/plex/app/README.md  |  11 +-
 .../plex/app/configmap-plex-log-tail.yaml     |   4 +-
 .../{fluent.yaml => kube-logging.yaml}        |   6 +-
 .../flux-repositories/kustomization.yaml      |   2 +-
 .../platform/kustomization.yaml               |   2 +-
 .../platform/logging/app/README.md            |  94 ++++++++++
 .../platform/logging/app/clusterflow-all.yaml |  30 ++++
 .../logging/app/clusteroutput-vcflogs.yaml    |  50 ++++++
 .../logging/app/helmrelease-operator.yaml     |  45 +++++
 .../platform/logging/app/kustomization.yaml   |  12 ++
 .../platform/logging/app/logging.yaml         |  83 +++++++++
 .../platform/logging/app/namespace.yaml       |  11 ++
 .../{tanzu-system-logging => logging}/ks.yaml |   4 +-
 .../tanzu-system-logging/app/helmrelease.yaml | 166 ------------------
 .../app/kustomization.yaml                    |   5 -
 .../tanzu-system-logging/app/namespace.yaml   |   8 -
 19 files changed, 349 insertions(+), 198 deletions(-)
 rename cluster-talos/kubernetes/infrastructure/flux-system/flux-repositories/{fluent.yaml => kube-logging.yaml} (58%)
 create mode 100644 cluster-talos/kubernetes/infrastructure/platform/logging/app/README.md
 create mode 100644 cluster-talos/kubernetes/infrastructure/platform/logging/app/clusterflow-all.yaml
 create mode 100644 cluster-talos/kubernetes/infrastructure/platform/logging/app/clusteroutput-vcflogs.yaml
 create mode 100644 cluster-talos/kubernetes/infrastructure/platform/logging/app/helmrelease-operator.yaml
 create mode 100644 cluster-talos/kubernetes/infrastructure/platform/logging/app/kustomization.yaml
 create mode 100644 cluster-talos/kubernetes/infrastructure/platform/logging/app/logging.yaml
 create mode 100644 cluster-talos/kubernetes/infrastructure/platform/logging/app/namespace.yaml
 rename cluster-talos/kubernetes/infrastructure/platform/{tanzu-system-logging => logging}/ks.yaml (74%)
 delete mode 100644 cluster-talos/kubernetes/infrastructure/platform/tanzu-system-logging/app/helmrelease.yaml
 delete mode 100644 cluster-talos/kubernetes/infrastructure/platform/tanzu-system-logging/app/kustomization.yaml
 delete mode 100644 cluster-talos/kubernetes/infrastructure/platform/tanzu-system-logging/app/namespace.yaml

diff --git a/cluster-talos/README.md b/cluster-talos/README.md
index 8dbb697c..a10acaeb 100644
--- a/cluster-talos/README.md
+++ b/cluster-talos/README.md
@@ -110,7 +110,7 @@ cluster-talos/
     │       ├── cloudflare-operator-system/
     │       │   ├── cloudflare-operator/ # ClusterTunnel CRD + operator
     │       │   └── cloudflare-tunnel/   # ClusterTunnel instance + default TunnelBinding
-    │       ├── tanzu-system-logging/    # Fluent-Bit DaemonSet → VCF Operations for Logs (syslog rfc5424)
+    │       ├── logging/                 # Logging Operator: Fluent Bit DS → Fluentd STS → VCF Operations for Logs (CFAPI)
     │       ├── kasten-io/               # Kasten K10 (LDAPS to AD)
     │       ├── spegel/                  # P2P containerd image cache
     │       ├── renovate/                # Dependency update bot
diff --git a/cluster-talos/kubernetes/apps/media/plex-test/app/README.md b/cluster-talos/kubernetes/apps/media/plex-test/app/README.md
index 30ce1cde..53d90e23 100644
--- a/cluster-talos/kubernetes/apps/media/plex-test/app/README.md
+++ b/cluster-talos/kubernetes/apps/media/plex-test/app/README.md
@@ -28,8 +28,8 @@ Mirrors the same mechanism the prod `plex` HR uses (see
 (`plex-log-tail`) that runs `tail -n0 -F` on
 `/config/Library/Application Support/Plex Media Server/Logs/Plex Media Server.log`
 as uid 1000 inside the `pms` container. Output flows to container
-stdout → fluent-bit DaemonSet in `tanzu-system-logging` →
-`skw-vcflogs.boeye.net:514`.
+stdout → Logging Operator Fluent Bit DS (`logging` ns) → Fluentd
+aggregator → `skw-vcflogs.boeye.net:9543` CFAPI HTTPS.
 
 Only delta vs prod: the persistence entry hangs off controller key `pms`
 (plex-test calls the PMS container that, not `app` like prod does) and
diff --git a/cluster-talos/kubernetes/apps/media/plex-test/app/configmap-plex-log-tail.yaml b/cluster-talos/kubernetes/apps/media/plex-test/app/configmap-plex-log-tail.yaml
index 5be1d3de..dd87647e 100644
--- a/cluster-talos/kubernetes/apps/media/plex-test/app/configmap-plex-log-tail.yaml
+++ b/cluster-talos/kubernetes/apps/media/plex-test/app/configmap-plex-log-tail.yaml
@@ -1,10 +1,10 @@
 ---
 # Same pattern as prod plex (cluster-talos/kubernetes/apps/media/plex/app/
 # configmap-plex-log-tail.yaml) — s6-overlay longrun that tails PMS's
-# file-based log into stdout so fluent-bit (tanzu-system-logging
-# DaemonSet) ships request/decision detail to skw-vcflogs alongside
-# the regular container logs. `tail -F` follows by name across PMS's
-# internal 10 MB log rotation.
+# file-based log into stdout so the Logging Operator's Fluent Bit
+# DaemonSet (ns=logging) ships request/decision detail to skw-vcflogs
+# alongside the regular container logs. `tail -F` follows by name
+# across PMS's internal 10 MB log rotation.
 apiVersion: v1
 kind: ConfigMap
 metadata:
diff --git a/cluster-talos/kubernetes/apps/media/plex/app/README.md b/cluster-talos/kubernetes/apps/media/plex/app/README.md
index 744863be..385bcc11 100644
--- a/cluster-talos/kubernetes/apps/media/plex/app/README.md
+++ b/cluster-talos/kubernetes/apps/media/plex/app/README.md
@@ -58,10 +58,13 @@ client) is in memory `reference_pms_html_tv_app_ac3_override.md`.
 `configmap-plex-log-tail.yaml` registers an s6-overlay v3 longrun named
 `plex-log-tail` inside the plex container. It tails the file Plex writes
 to (`/config/Library/Application Support/Plex Media Server/Logs/Plex Media Server.log`)
-into the container's stdout. The cluster's fluent-bit DaemonSet
-(`tanzu-system-logging`) picks it up along with the rest of `/var/log/
-containers/*.log` and ships everything to `skw-vcflogs.boeye.net:514`
-via syslog RFC5424.
+into the container's stdout. The Logging Operator's Fluent Bit
+DaemonSet (`logging` ns) picks it up along with the rest of
+`/var/log/containers/*.log` and forwards to the Fluentd aggregator,
+which posts to `skw-vcflogs.boeye.net:9543` via CFAPI HTTPS (the
+old syslog/RFC5424 path was replaced 2026-05-27 to escape its
+2048-byte per-message cap that clipped long Plex Web Request
+lines).
 
 The longrun layout mirrors how `scaleplex_pms_dockermod` already wires
 `scaleplex-relay` — three files mounted under `/etc/s6-overlay/s6-rc.d/`:
diff --git a/cluster-talos/kubernetes/apps/media/plex/app/configmap-plex-log-tail.yaml b/cluster-talos/kubernetes/apps/media/plex/app/configmap-plex-log-tail.yaml
index a0ef04ce..93216650 100644
--- a/cluster-talos/kubernetes/apps/media/plex/app/configmap-plex-log-tail.yaml
+++ b/cluster-talos/kubernetes/apps/media/plex/app/configmap-plex-log-tail.yaml
@@ -4,8 +4,8 @@
 # `/config/Library/Application Support/Plex Media Server/Logs/Plex Media Server.log`
 # (no exposed knob to redirect that to stdout); without this service
 # the verbose request/decision lines never reach kubectl/Freelens or
-# the tanzu-system-logging fluent-bit DaemonSet that ships container
-# stdout to skw-vcflogs.boeye.net:514.
+# the Logging Operator's Fluent Bit DaemonSet (ns=logging) that
+# tails container stdout → Fluentd aggregator → skw-vcflogs CFAPI.
 #
 # Pattern mirrors the scaleplex_pms_dockermod's `scaleplex-relay`
 # longrun (same s6-overlay v3 layout — three files per service:
diff --git a/cluster-talos/kubernetes/infrastructure/flux-system/flux-repositories/fluent.yaml b/cluster-talos/kubernetes/infrastructure/flux-system/flux-repositories/kube-logging.yaml
similarity index 58%
rename from cluster-talos/kubernetes/infrastructure/flux-system/flux-repositories/fluent.yaml
rename to cluster-talos/kubernetes/infrastructure/flux-system/flux-repositories/kube-logging.yaml
index 01a50e18..a1d1c435 100644
--- a/cluster-talos/kubernetes/infrastructure/flux-system/flux-repositories/fluent.yaml
+++ b/cluster-talos/kubernetes/infrastructure/flux-system/flux-repositories/kube-logging.yaml
@@ -1,8 +1,10 @@
+---
 apiVersion: source.toolkit.fluxcd.io/v1
 kind: HelmRepository
 metadata:
-  name: fluent
+  name: kube-logging
   namespace: flux-system
 spec:
+  type: oci
+  url: oci://ghcr.io/kube-logging/helm-charts
   interval: 1h
-  url: https://fluent.github.io/helm-charts
diff --git a/cluster-talos/kubernetes/infrastructure/flux-system/flux-repositories/kustomization.yaml b/cluster-talos/kubernetes/infrastructure/flux-system/flux-repositories/kustomization.yaml
index 14308bf1..41a0a87f 100644
--- a/cluster-talos/kubernetes/infrastructure/flux-system/flux-repositories/kustomization.yaml
+++ b/cluster-talos/kubernetes/infrastructure/flux-system/flux-repositories/kustomization.yaml
@@ -20,6 +20,6 @@ resources:
   - prometheus-community.yaml
   - descheduler.yaml
   - stakater.yaml
-  - fluent.yaml
+  - kube-logging.yaml
   - gpu-node-vsphere-maintenance-controller.yaml
   - authentik.yaml
diff --git a/cluster-talos/kubernetes/infrastructure/platform/kustomization.yaml b/cluster-talos/kubernetes/infrastructure/platform/kustomization.yaml
index 95e4efbc..2faab0f3 100644
--- a/cluster-talos/kubernetes/infrastructure/platform/kustomization.yaml
+++ b/cluster-talos/kubernetes/infrastructure/platform/kustomization.yaml
@@ -13,7 +13,7 @@ resources:
   - external-dns/external-dns-cloudflare/ks.yaml
   - cloudflare-operator-system/cloudflare-operator/ks.yaml
   - cloudflare-operator-system/cloudflare-tunnel/ks.yaml
-  - tanzu-system-logging/ks.yaml
+  - logging/ks.yaml
   - kasten-io/ks.yaml
   - spegel/ks.yaml
   - renovate/ks.yaml
diff --git a/cluster-talos/kubernetes/infrastructure/platform/logging/app/README.md b/cluster-talos/kubernetes/infrastructure/platform/logging/app/README.md
new file mode 100644
index 00000000..7f09b40a
--- /dev/null
+++ b/cluster-talos/kubernetes/infrastructure/platform/logging/app/README.md
@@ -0,0 +1,94 @@
+# `logging/` — cluster log forwarding to vcflogs via Logging Operator
+
+Replaces the previous `tanzu-system-logging/` stack (standalone
+fluent-bit DS + custom `vcflogs-cfapi-adapter` sidecar). Migrated
+2026-05-27 to the [Logging Operator](https://kube-logging.dev/)
+pattern so the VMware Aria CFAPI translation comes from an upstream
+plugin bundled in the operator's Fluentd image instead of code we own.
+
+## Architecture
+
+```
+/var/log/containers/*.log   on every node
+        │
+        ▼
+┌──────────────────────────────────────────────────────────────┐
+│  Fluent Bit DaemonSet (operator-managed, ns=logging)         │
+│    INPUT  tail        /var/log/containers/*.log              │
+│    FILTER kubernetes  enrich w/ pod/ns/labels                │
+│    OUTPUT forward     → Fluentd Service (operator-managed)   │
+└─────────────────────────────┬────────────────────────────────┘
+                              │ Fluentd forward protocol
+                              │ (port 24240, in-cluster)
+                              ▼
+┌──────────────────────────────────────────────────────────────┐
+│  Fluentd StatefulSet ×2 (HA, operator-managed)               │
+│    @type forward                                             │
+│    @type vmware_loginsight (fluent-plugin-vmware-loginsight) │
+│      → CFAPI POST {"events":[…]}                             │
+└─────────────────────────────┬────────────────────────────────┘
+                              │ HTTPS POST
+                              ▼
+              skw-vcflogs.boeye.net:9543
+              /api/v1/events/ingest/k8s-talos
+```
+
+No 2048-byte syslog cap. No homemade adapter. The
+`fluent-plugin-vmware-loginsight` gem (v1.4.2) is bundled in the
+operator's `ghcr.io/kube-logging/fluentd:v1.17-5.0-full` image —
+nothing to build.
+
+## CRD breakdown
+
+| File | CRD | Purpose |
+|---|---|---|
+| `helmrelease-operator.yaml` | `HelmRelease` | Installs the operator + CRDs |
+| `logging.yaml` | `Logging` | Declares the pipeline (which Fluent Bit + Fluentd specs to render) |
+| `clusteroutput-vcflogs.yaml` | `ClusterOutput` | The vmwareLogInsight destination, usable from any namespace |
+| `clusterflow-all.yaml` | `ClusterFlow` | "Match everything → send to vcflogs" |
+
+The split between Logging (infrastructure) and Flow/Output (routing)
+is intentional in the operator design — Logging is platform, Flow
++ Output is policy. At a multi-tenant work-scale, namespaces would
+get their own `Flow` CRs (namespace-scoped, can only target outputs
+their team owns), while ops would manage `ClusterFlow` /
+`ClusterOutput` for cross-cutting destinations.
+
+## What's where in the cluster
+
+- **`logging` namespace** holds the operator pod + Fluent Bit DS + Fluentd STS
+- **Fluent Bit pods** mount `hostPath: /var/log/containers` to read CRI logs
+- **Fluentd pods** mount `5Gi` PVC each (`longhorn` StorageClass) for the file buffer that absorbs vcflogs back-pressure
+- **leader election** uses a Lease in this ns
+
+## Tuning knobs
+
+| What | Where |
+|---|---|
+| Fluent Bit resources / tolerations | `logging.yaml` → `spec.fluentbit` |
+| Fluentd replicas (HA) | `logging.yaml` → `spec.fluentd.scaling.replicas` |
+| Fluentd buffer size / storage class | `logging.yaml` → `spec.fluentd.bufferStorageVolume.pvc` |
+| CFAPI endpoint / TLS posture | `clusteroutput-vcflogs.yaml` → `spec.vmwareLogInsight` |
+| Buffer flush cadence / retry | `clusteroutput-vcflogs.yaml` → `spec.vmwareLogInsight.buffer` |
+| Per-namespace routing | replace `clusterflow-all.yaml` with multiple `Flow` / `ClusterFlow` CRs |
+
+## Reverting (if needed)
+
+```
+git revert <merge-commit-of-this-PR>
+flux reconcile kustomization platform -n flux-system
+```
+
+This re-creates the old `tanzu-system-logging/fluent-bit` HelmRelease.
+That HelmRelease runs the `vcflogs-cfapi-adapter` sidecar, whose ghcr.io
+image is slated for manual deletion after this migration. Once it is
+gone, rebuild and republish the image from git history before reverting;
+otherwise the only fallback is syslog with its 2048-byte cap.
+
+## References
+
+- [Logging Operator docs](https://kube-logging.dev/docs/)
+- [`vmwareLogInsight` output reference](https://kube-logging.dev/docs/configuration/plugins/outputs/vmware_loginsight/)
+- [`fluent-plugin-vmware-loginsight` upstream (archived)](https://github.com/vmware-archive/fluent-plugin-vmware-loginsight)
+- VMware Aria Operations for Logs [ingest API](https://developer.broadcom.com/xapis/vrealize-log-insight-api/latest/)
+- Predecessor: PR Varashi/k8s#151 (homemade vcflogs-cfapi-adapter sidecar)
diff --git a/cluster-talos/kubernetes/infrastructure/platform/logging/app/clusterflow-all.yaml b/cluster-talos/kubernetes/infrastructure/platform/logging/app/clusterflow-all.yaml
new file mode 100644
index 00000000..b3e3ca58
--- /dev/null
+++ b/cluster-talos/kubernetes/infrastructure/platform/logging/app/clusterflow-all.yaml
@@ -0,0 +1,30 @@
+---
+# ClusterFlow — the routing rule. Matches *everything* from every
+# namespace and sends it to the vcflogs ClusterOutput. Mirrors the
+# previous fluent-bit `[OUTPUT] syslog Match kube.*` behavior:
+# every container log goes to vcflogs, no per-namespace filtering.
+#
+# When we want per-namespace selectivity later (e.g., suppress noisy
+# system logs), replace this with multiple Flow/ClusterFlow CRs and
+# selectors. For now: one ClusterFlow, one ClusterOutput.
+apiVersion: logging.banzaicloud.io/v1beta1
+kind: ClusterFlow
+metadata:
+  name: all-to-vcflogs
+  namespace: logging
+spec:
+  match:
+    # Match every namespace. The empty `select: {}` is the
+    # operator's canonical "match all" form.
+    - select: {}
+  globalOutputRefs:
+    - vcflogs
+  # Filters run in order; only one is needed here. Kubernetes
+  # metadata enrichment is not configured in this flow — it is
+  # added automatically by the operator for Fluent Bit collection.
+  filters:
+    # Drop fluent-bit's per-line `logtag` (P/F) marker — it's a
+    # CRI partial-line indicator, not useful to vcflogs. Keep
+    # everything else.
+    - record_modifier:
+        remove_keys: logtag
diff --git a/cluster-talos/kubernetes/infrastructure/platform/logging/app/clusteroutput-vcflogs.yaml b/cluster-talos/kubernetes/infrastructure/platform/logging/app/clusteroutput-vcflogs.yaml
new file mode 100644
index 00000000..0ac9ef4f
--- /dev/null
+++ b/cluster-talos/kubernetes/infrastructure/platform/logging/app/clusteroutput-vcflogs.yaml
@@ -0,0 +1,50 @@
+---
+# ClusterOutput — the destination. Lives in the control namespace
+# and is usable cluster-wide via a ClusterFlow. Wraps the
+# `fluent-plugin-vmware-loginsight` (v1.4.2, bundled in the operator's
+# v1.17-5.0-full Fluentd image) and POSTs to vRealize Log Insight's
+# CFAPI ingest endpoint.
+#
+# Why CFAPI (HTTPS) vs syslog (TCP/514): the syslog path enforces
+# RFC 5424's 2048-byte per-message cap. Plex Web Request: lines run
+# ~3800 bytes when X-Plex-Client-Profile-Extra is present, getting
+# clipped mid-token at byte 2040 — losing X-Plex-Product /
+# X-Plex-Version / X-Plex-Token from the tail. CFAPI has no
+# documented size cap. (See:
+#   https://github.com/Varashi/k8s/pull/151 for the prior homemade
+#   sidecar that did this translation by hand before we adopted the
+#   operator pattern.)
+apiVersion: logging.banzaicloud.io/v1beta1
+kind: ClusterOutput
+metadata:
+  name: vcflogs
+  namespace: logging
+spec:
+  vmwareLogInsight:
+    scheme: https
+    # Aria appliance ships a self-signed cert; we accept it on the
+    # internal network. Flip to true once the cert gets a real CA.
+    ssl_verify: false
+    host: "skw-vcflogs.${SECRET_DOMAIN}"
+    port: 9543
+    # CFAPI's agent_id is an arbitrary tag the receiver records
+    # against each event — not auth, just an identifier so vcflogs
+    # can attribute the stream. Matches what the previous adapter
+    # used (k8s-talos).
+    agent_id: k8s-talos
+    # Default `log_text_keys` (`log`, `msg`, `message`) is what
+    # we want — the CRI parser populates `log`/`message` for
+    # container stdout/stderr, and the kubernetes_metadata filter
+    # may copy to `msg`.
+    log_text_keys:
+      - log
+      - msg
+      - message
+    # Plugin retries on 5xx and drops on 4xx by default. Buffer
+    # chunk size + flush cadence are set explicitly below; chunks
+    # are stored on the Logging CR's fluentd.bufferStorageVolume PVC.
+    buffer:
+      flush_interval: 5s
+      retry_max_interval: 30s
+      chunk_limit_size: 8MB
+      total_limit_size: 1GB
diff --git a/cluster-talos/kubernetes/infrastructure/platform/logging/app/helmrelease-operator.yaml b/cluster-talos/kubernetes/infrastructure/platform/logging/app/helmrelease-operator.yaml
new file mode 100644
index 00000000..05622dee
--- /dev/null
+++ b/cluster-talos/kubernetes/infrastructure/platform/logging/app/helmrelease-operator.yaml
@@ -0,0 +1,45 @@
+---
+# Logging Operator — installs the controller that watches Logging /
+# Flow / Output CRDs and reconciles a Fluent Bit DaemonSet + Fluentd
+# StatefulSet from them. The chart ships only the operator + CRDs;
+# the actual logging pipeline is declared via the CRs in the
+# sibling logging.yaml / clusteroutput-*.yaml / clusterflow-*.yaml.
+#
+# Docs: https://kube-logging.dev/docs/
+# Chart: oci://ghcr.io/kube-logging/helm-charts/logging-operator
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: logging-operator
+  namespace: logging
+spec:
+  interval: 30m
+  chart:
+    spec:
+      chart: logging-operator
+      version: 6.5.2
+      sourceRef:
+        kind: HelmRepository
+        name: kube-logging
+        namespace: flux-system
+  install:
+    remediation:
+      retries: 3
+    crds: CreateReplace
+  upgrade:
+    cleanupOnFail: true
+    crds: CreateReplace
+    remediation:
+      strategy: rollback
+      retries: 3
+  values:
+    # Keep the operator itself lean — it just watches CRDs.
+    resources:
+      requests: {cpu: 10m, memory: 64Mi}
+      limits:   {cpu: 200m, memory: 256Mi}
+    # The operator's leader election uses a Lease in this ns.
+    enableLeaderElection: true
+    # Don't ship the bundled `logging` resource — we declare ours
+    # explicitly in logging.yaml so it's GitOps-visible.
+    logging:
+      enabled: false
diff --git a/cluster-talos/kubernetes/infrastructure/platform/logging/app/kustomization.yaml b/cluster-talos/kubernetes/infrastructure/platform/logging/app/kustomization.yaml
new file mode 100644
index 00000000..63035bb4
--- /dev/null
+++ b/cluster-talos/kubernetes/infrastructure/platform/logging/app/kustomization.yaml
@@ -0,0 +1,12 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - namespace.yaml
+  - helmrelease-operator.yaml
+  # The CRs below need the operator's CRDs (Logging, Flow, ClusterFlow,
+  # Output, ClusterOutput), which only the HelmRelease above installs.
+  # NOTE(review): on a fresh cluster Flux's dry-run may reject the whole
+  # apply until the CRDs exist — consider a separate ks with dependsOn.
+  - logging.yaml
+  - clusteroutput-vcflogs.yaml
+  - clusterflow-all.yaml
diff --git a/cluster-talos/kubernetes/infrastructure/platform/logging/app/logging.yaml b/cluster-talos/kubernetes/infrastructure/platform/logging/app/logging.yaml
new file mode 100644
index 00000000..b8997e54
--- /dev/null
+++ b/cluster-talos/kubernetes/infrastructure/platform/logging/app/logging.yaml
@@ -0,0 +1,83 @@
+---
+# Logging — the operator's top-level CRD. Declares the
+# infrastructure (Fluent Bit collector DaemonSet + Fluentd aggregator
+# StatefulSet) for one logical logging pipeline. Cluster-wide CRDs
+# (ClusterFlow / ClusterOutput) in controlNamespace bind to this Logging.
+#
+# Pattern: lightweight per-node collectors → central aggregator(s)
+# that run the heavy output plugins. The vmware-loginsight gem
+# (fluent-plugin-vmware-loginsight 1.4.2) is bundled in
+# ghcr.io/kube-logging/fluentd:v1.17-5.0-full, so no custom image.
+apiVersion: logging.banzaicloud.io/v1beta1
+kind: Logging
+metadata:
+  name: vcflogs
+  namespace: logging
+spec:
+  controlNamespace: logging
+
+  # ---------- Fluent Bit (per-node collector DaemonSet) ----------
+  fluentbit:
+    # Use a recent stable Fluent Bit image. The Logging Operator
+    # generates the config; we tune resources + the buffer volume.
+    image:
+      repository: fluent/fluent-bit
+      tag: 3.1.10
+      pullPolicy: IfNotPresent
+    resources:
+      requests: {cpu: 50m, memory: 100Mi}
+      limits:   {cpu: 500m, memory: 200Mi}
+    tolerations:
+      - operator: Exists
+    # Buffer volume on a node hostPath. The memory limit set in
+    # inputTail below matches the previous standalone DS (5MB).
+    bufferStorageVolume:
+      hostPath:
+        path: "/var/lib/fluent-bit-buffer"
+    # Skip lines that exceed the input buffer (default) — but our
+    # forward-out to Fluentd has no syslog 2048 ceiling, so any
+    # long log line that survives Fluent Bit's parser makes it to
+    # the aggregator intact. Field names mirror fluent-bit's
+    # native [INPUT] tail option casing (`Skip_Long_Lines`,
+    # `Mem_Buf_Limit`, `Refresh_Interval`) — the operator passes
+    # them through to the rendered config verbatim.
+    inputTail:
+      Skip_Long_Lines: "On"
+      Mem_Buf_Limit: 5MB
+      Refresh_Interval: "10"
+
+  # ---------- Fluentd (central aggregator StatefulSet) ----------
+  fluentd:
+    # Image MUST be the `full` variant — it carries the
+    # fluent-plugin-vmware-loginsight gem we need for CFAPI output.
+    # `filters` variant only has filter plugins.
+    image:
+      repository: ghcr.io/kube-logging/fluentd
+      tag: v1.17-5.0-full
+      pullPolicy: IfNotPresent
+    # HA: 2 replicas behind the operator-managed Fluentd Service, so
+    # losing one pod leaves a target for Fluent Bit's forward output.
+    scaling:
+      replicas: 2
+    resources:
+      requests: {cpu: 100m, memory: 256Mi}
+      limits:   {cpu: 1000m, memory: 1Gi}
+    # File-buffer on a small PVC per replica — survives pod restart,
+    # absorbs vcflogs back-pressure. PVC size chosen to hold ~1h of
+    # peak log volume (LogVerbose=1 on prod plex can push 200MB/h).
+    bufferStorageVolume:
+      pvc:
+        spec:
+          accessModes: [ReadWriteOnce]
+          resources:
+            requests:
+              storage: 5Gi
+          # k8s-talos default StorageClass is longhorn — change here
+          # if/when we migrate (memory: longhorn removal planned).
+          storageClassName: longhorn
+    # Tolerate all nodes so the aggregator can land anywhere if
+    # capacity tightens.
+    tolerations:
+      - operator: Exists
+    # Don't let the aggregator block node maintenance for hours.
+    terminationGracePeriodSeconds: 60
diff --git a/cluster-talos/kubernetes/infrastructure/platform/logging/app/namespace.yaml b/cluster-talos/kubernetes/infrastructure/platform/logging/app/namespace.yaml
new file mode 100644
index 00000000..b82524ef
--- /dev/null
+++ b/cluster-talos/kubernetes/infrastructure/platform/logging/app/namespace.yaml
@@ -0,0 +1,11 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: logging
+  labels:
+    # Logging Operator's Fluent Bit DaemonSet mounts /var/log/containers
+    # (hostPath) and needs `privileged` PSA to mount those paths.
+    # Same posture as the previous tanzu-system-logging ns.
+    pod-security.kubernetes.io/enforce: privileged
+    pod-security.kubernetes.io/audit: privileged
+    pod-security.kubernetes.io/warn: privileged
diff --git a/cluster-talos/kubernetes/infrastructure/platform/tanzu-system-logging/ks.yaml b/cluster-talos/kubernetes/infrastructure/platform/logging/ks.yaml
similarity index 74%
rename from cluster-talos/kubernetes/infrastructure/platform/tanzu-system-logging/ks.yaml
rename to cluster-talos/kubernetes/infrastructure/platform/logging/ks.yaml
index 87f6883e..4ed141fe 100644
--- a/cluster-talos/kubernetes/infrastructure/platform/tanzu-system-logging/ks.yaml
+++ b/cluster-talos/kubernetes/infrastructure/platform/logging/ks.yaml
@@ -2,13 +2,13 @@
 apiVersion: kustomize.toolkit.fluxcd.io/v1
 kind: Kustomization
 metadata:
-  name: tanzu-system-logging
+  name: logging
   namespace: flux-system
 spec:
   interval: 1h
   dependsOn:
     - name: configs
-  path: ./cluster-talos/kubernetes/infrastructure/platform/tanzu-system-logging/app
+  path: ./cluster-talos/kubernetes/infrastructure/platform/logging/app
   prune: true
   sourceRef:
     kind: GitRepository
diff --git a/cluster-talos/kubernetes/infrastructure/platform/tanzu-system-logging/app/helmrelease.yaml b/cluster-talos/kubernetes/infrastructure/platform/tanzu-system-logging/app/helmrelease.yaml
deleted file mode 100644
index 7362a343..00000000
--- a/cluster-talos/kubernetes/infrastructure/platform/tanzu-system-logging/app/helmrelease.yaml
+++ /dev/null
@@ -1,166 +0,0 @@
-apiVersion: helm.toolkit.fluxcd.io/v2
-kind: HelmRelease
-metadata:
-  name: fluent-bit
-  namespace: tanzu-system-logging
-spec:
-  interval: 30m
-  chart:
-    spec:
-      chart: fluent-bit
-      version: 0.57.5
-      sourceRef:
-        kind: HelmRepository
-        name: fluent
-        namespace: flux-system
-  install:
-    remediation:
-      retries: 3
-  upgrade:
-    cleanupOnFail: true
-    remediation:
-      strategy: rollback
-      retries: 3
-  values:
-    fullnameOverride: fluent-bit
-    kind: DaemonSet
-    tolerations:
-      - operator: Exists
-    # vcflogs-cfapi-adapter sidecar — translates fluent-bit's [OUTPUT]
-    # http (top-level JSON array) into the {"events":[...]} wrapper
-    # vRealize Log Insight's CFAPI ingest expects. Was on syslog
-    # (TCP/514) until 2026-05-27; that path enforced RFC 5424's
-    # 2048-byte cap, clipping long Plex Web Request: lines past byte
-    # 2040 (the X-Plex-Client-Profile-Extra header alone runs 1500+
-    # bytes, taking X-Plex-Product/Version/Token with it into the
-    # cut). CFAPI has no documented size cap. See
-    # github.com/Varashi/vcflogs-cfapi-adapter for the adapter source.
-    extraContainers:
-      - name: vcflogs-cfapi-adapter
-        image: ghcr.io/varashi/vcflogs-cfapi-adapter:v0.1.0
-        imagePullPolicy: IfNotPresent
-        env:
-          # Bind to 0.0.0.0 (all interfaces) inside the container's
-          # netns so kubelet's httpGet probe (which connects from the
-          # node, NOT the pod) can reach the port. The container netns
-          # is pod-scoped — no Service/hostPort means nothing outside
-          # the pod can reach this port anyway. fluent-bit (same pod,
-          # same netns) still reaches us via 127.0.0.1:8080.
-          - name: LISTEN_ADDR
-            value: ":8080"
-          - name: VCFLOGS_INGEST_URL
-            value: "https://skw-vcflogs.${SECRET_DOMAIN}:9543/api/v1/events/ingest/k8s-talos"
-          - name: VCFLOGS_TLS_INSECURE
-            value: "true"
-          - name: FORWARD_TIMEOUT
-            value: "15s"
-        ports:
-          - name: ingest
-            containerPort: 8080
-        resources:
-          requests: {cpu: 10m, memory: 32Mi}
-          limits:   {cpu: 100m, memory: 128Mi}
-        livenessProbe:
-          httpGet: {path: /healthz, port: ingest}
-          initialDelaySeconds: 5
-          periodSeconds: 30
-        readinessProbe:
-          httpGet: {path: /healthz, port: ingest}
-          initialDelaySeconds: 2
-          periodSeconds: 10
-        securityContext:
-          allowPrivilegeEscalation: false
-          readOnlyRootFilesystem: true
-          runAsNonRoot: true
-          runAsUser: 65532
-          capabilities: {drop: [ALL]}
-    config:
-      service: |
-        [Service]
-          Flush         1
-          Log_Level     info
-          Daemon        off
-          Parsers_File  parsers.conf
-          HTTP_Server   On
-          HTTP_Listen   0.0.0.0
-          HTTP_Port     2020
-      inputs: |
-        [INPUT]
-          Name                tail
-          Path                /var/log/containers/*.log
-          Parser              cri
-          DB                  /var/log/flb_kube.db
-          Tag                 kube.*
-          Mem_Buf_Limit       5MB
-          Skip_Long_Lines     On
-          Refresh_Interval    10
-      filters: |
-        [FILTER]
-          Name                kubernetes
-          Match               kube.*
-          Kube_URL            https://kubernetes.default.svc:443
-          Kube_CA_File        /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-          Kube_Token_File     /var/run/secrets/kubernetes.io/serviceaccount/token
-          Kube_Tag_Prefix     kube.var.log.containers.
-          Merge_Log           On
-          Merge_Log_Key       log_processed
-          K8S-Logging.Parser  On
-          K8S-Logging.Exclude On
-
-        [FILTER]
-          Name                record_modifier
-          Match               *
-          Record tkg_instance k8s-talos
-          Record tkg_cluster  k8s-talos
-
-        [FILTER]
-          Name                modify
-          Match               kube.*
-          Copy                kubernetes k8s
-
-        [FILTER]
-          Name                record_modifier
-          Match               kube.*
-          Record              node_name $${HOSTNAME}
-
-        [FILTER]
-          Name                nest
-          Match               kube.*
-          Operation           lift
-          Nested_Under        kubernetes
-      outputs: |
-        # CFAPI HTTPS ingest via the vcflogs-cfapi-adapter sidecar
-        # (see extraContainers above). Replaces the previous syslog
-        # output, which hit the RFC 5424 2048-byte per-message cap
-        # and truncated long Plex Web Request: lines mid-token —
-        # losing X-Plex-Product / X-Plex-Version / X-Plex-Token from
-        # the tail when X-Plex-Client-Profile-Extra (1.5KB typical)
-        # appeared earlier in the header list.
-        #
-        # Format json + json_date_format epoch_ms produce the shape
-        # the adapter expects: top-level JSON array of records, each
-        # with a numeric `timestamp` key in epoch ms. The adapter
-        # reshapes per-record into CFAPI's
-        # {"events":[{"text":...,"timestamp":...,"fields":[...]}]}
-        # wrapper and POSTs to skw-vcflogs CFAPI.
-        [OUTPUT]
-          Name              http
-          Match             kube.*
-          Host              127.0.0.1
-          Port              8080
-          URI               /
-          Format            json
-          json_date_key     timestamp
-          json_date_format  epoch_ms
-          tls               Off
-          # Loopback to the sidecar — retries are cheap; let
-          # fluent-bit drive the back-pressure rather than blocking
-          # the input thread on a long upstream POST.
-          Retry_Limit       5
-      customParsers: |
-        [PARSER]
-          Name                 cri
-          Format               regex
-          Regex                ^(?<time>[^ ]+) (?<stream>stdout|stderr) (?<logtag>[^ ]*) (?<message>.*)$
-          Time_Key             time
-          Time_Format          %Y-%m-%dT%H:%M:%S.%L%z
diff --git a/cluster-talos/kubernetes/infrastructure/platform/tanzu-system-logging/app/kustomization.yaml b/cluster-talos/kubernetes/infrastructure/platform/tanzu-system-logging/app/kustomization.yaml
deleted file mode 100644
index 0370974a..00000000
--- a/cluster-talos/kubernetes/infrastructure/platform/tanzu-system-logging/app/kustomization.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-resources:
-  - namespace.yaml
-  - helmrelease.yaml
diff --git a/cluster-talos/kubernetes/infrastructure/platform/tanzu-system-logging/app/namespace.yaml b/cluster-talos/kubernetes/infrastructure/platform/tanzu-system-logging/app/namespace.yaml
deleted file mode 100644
index 65e4e91d..00000000
--- a/cluster-talos/kubernetes/infrastructure/platform/tanzu-system-logging/app/namespace.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: tanzu-system-logging
-  labels:
-    pod-security.kubernetes.io/enforce: privileged
-    pod-security.kubernetes.io/audit: privileged
-    pod-security.kubernetes.io/warn: privileged