From 8730677c26a737ca91bdc93eb826ffb8e34b7476 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Thu, 26 Mar 2026 15:52:29 +0200 Subject: [PATCH 01/21] Add tiered storage cache as a separate PVC --- api/v1beta1/kafkacluster_types.go | 21 ++-- api/v1beta1/kafkacluster_types_test.go | 18 ++++ pkg/resources/cruisecontrol/configmap.go | 7 ++ pkg/resources/cruisecontrol/configmap_test.go | 101 ++++++++++++++++++ 4 files changed, 140 insertions(+), 7 deletions(-) diff --git a/api/v1beta1/kafkacluster_types.go b/api/v1beta1/kafkacluster_types.go index 7ccae1cee..d636c0570 100644 --- a/api/v1beta1/kafkacluster_types.go +++ b/api/v1beta1/kafkacluster_types.go @@ -518,6 +518,12 @@ type StorageConfig struct { // the `pvcSpec` is used by default. // +optional EmptyDir *corev1.EmptyDirVolumeSource `json:"emptyDir,omitempty"` + + // TieredStorageCache indicates this storage is used for Kafka tiered storage cache + // (e.g., for rsm.config.fetch.chunk.cache.path). When set to true, this storage will be + // excluded from Cruise Control capacity calculations and will not be used as a Kafka log.dir. + // +optional + TieredStorageCache bool `json:"tieredStorageCache,omitempty"` } // ListenersConfig defines the Kafka listener types @@ -947,16 +953,17 @@ func (bConfig *BrokerConfig) GetTerminationGracePeriod() int64 { } // GetStorageMountPaths returns a string with comma-separated storage mount paths that the broker uses +// for Kafka log.dirs. Tiered storage cache volumes are excluded. func (bConfig *BrokerConfig) GetStorageMountPaths() string { - var mountPaths string - for i, sc := range bConfig.StorageConfigs { - if i != len(bConfig.StorageConfigs)-1 { - mountPaths += sc.MountPath + "," - } else { - mountPaths += sc.MountPath + var mountPaths []string + for _, sc := range bConfig.StorageConfigs { + // Skip tiered storage cache volumes - they should not be in log.dirs + if sc.TieredStorageCache { + continue } + mountPaths = append(mountPaths, sc.MountPath) } - return mountPaths + return strings.Join(mountPaths, ",") } // GetNodeSelector returns the node selector for cruise control diff --git a/api/v1beta1/kafkacluster_types_test.go b/api/v1beta1/kafkacluster_types_test.go index 38a7be38e..10c3c5362 100644 --- a/api/v1beta1/kafkacluster_types_test.go +++ b/api/v1beta1/kafkacluster_types_test.go @@ -622,6 +622,24 @@ func TestGetStorageMountPaths(t *testing.T) { }, expectedMountPaths: "test-log-1,test-log-2,test-log-3,test-log-4,test-log-5", }, + { + testName: "BrokerConfig with tiered storage cache should exclude it from mount paths", + brokerConfig: &BrokerConfig{ + StorageConfigs: []StorageConfig{ + { + MountPath: "test-log-1", + }, + { + MountPath: "test-log-2", + }, + { + MountPath: "/tiered-storage-cache", + TieredStorageCache: true, + }, + }, + }, + expectedMountPaths: "test-log-1,test-log-2", + }, } for _, test := range testCases { diff --git a/pkg/resources/cruisecontrol/configmap.go b/pkg/resources/cruisecontrol/configmap.go index 20735de89..10d079d2f 100644 --- a/pkg/resources/cruisecontrol/configmap.go +++ b/pkg/resources/cruisecontrol/configmap.go @@ -346,6 +346,13 @@ func generateBrokerDisks(brokerState v1beta1.Broker, kafkaClusterSpec v1beta1.Ka // Generate log dir configuration logDirs := make(map[string]string, len(storageConfigs)) for path, conf := range storageConfigs { + // Skip tiered storage cache volumes - they should not be included in Cruise Control capacity + if conf.TieredStorageCache { + log.V(1).Info("skipping tiered storage cache volume from Cruise Control capacity", + v1beta1.BrokerIdLabelKey, brokerState.Id, "mountPath", path) + continue + } + size := parseMountPathWithSize(conf) log.V(1).Info(fmt.Sprintf("broker log.dir %s size in MB: %d", path, size), v1beta1.BrokerIdLabelKey, brokerState.Id) diff --git a/pkg/resources/cruisecontrol/configmap_test.go b/pkg/resources/cruisecontrol/configmap_test.go index 89ec08f27..5e2420ed5 100644 --- a/pkg/resources/cruisecontrol/configmap_test.go +++ b/pkg/resources/cruisecontrol/configmap_test.go @@ -1017,4 +1017,105 @@ func TestGenerateCapacityConfigWithUserProvidedInput(t *testing.T) { } }) } + + // Test tiered storage cache exclusion from capacity + t.Run("tiered storage cache should be excluded from capacity", func(t *testing.T) { + quantity, _ := resource.ParseQuantity("100Gi") + tieredCacheQuantity, _ := resource.ParseQuantity("50Gi") + cpuQuantity, _ := resource.ParseQuantity("2000m") + + kafkaCluster := v1beta1.KafkaCluster{ + Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{ + { + Id: 0, + BrokerConfig: &v1beta1.BrokerConfig{ + Resources: &v1.ResourceRequirements{ + Limits: v1.ResourceList{ + "cpu": cpuQuantity, + }, + }, + StorageConfigs: []v1beta1.StorageConfig{ + { + MountPath: "/kafka-logs", + PvcSpec: &v1.PersistentVolumeClaimSpec{ + Resources: v1.VolumeResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceStorage: quantity, + }, + }, + }, + }, + { + MountPath: "/tiered-storage-cache", + TieredStorageCache: true, + PvcSpec: &v1.PersistentVolumeClaimSpec{ + Resources: v1.VolumeResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceStorage: tieredCacheQuantity, + }, + }, + }, + }, + }, + }, + }, + }, + }, + Status: v1beta1.KafkaClusterStatus{ + BrokersState: map[string]v1beta1.BrokerState{ + "0": {}, + }, + }, + } + + expectedConfiguration := `{ + "brokerCapacities": [ + { + "brokerId": "0", + "capacity": { + "DISK": { + "/kafka-logs/kafka": "107374" + }, + "CPU": "200", + "NW_IN": "125000", + "NW_OUT": "125000" + }, + "doc": "Capacity unit used for disk is in MB, cpu is in percentage, network throughput is in KB." + } + ] +}` + + var actual JBODInvariantCapacityConfig + rawStringActual, err := GenerateCapacityConfig(&kafkaCluster, logr.Discard(), nil) + if err != nil { + t.Error(err, "could not generate capacity config") + } + err = json.Unmarshal([]byte(rawStringActual), &actual) + if err != nil { + t.Error(err, "could not unmarshal actual json") + } + + var expected JBODInvariantCapacityConfig + err = json.Unmarshal([]byte(expectedConfiguration), &expected) + if err != nil { + t.Error(err, "could not unmarshal expected json") + } + + if !reflect.DeepEqual(actual, expected) { + t.Errorf("Expected: %+v, got: %+v", expected, actual) + } + + // Verify that tiered storage cache is NOT in the capacity + actualCapacity := actual.Capacities[0].(map[string]interface{}) + diskCapacity := actualCapacity["capacity"].(map[string]interface{})["DISK"].(map[string]interface{}) + + if _, exists := diskCapacity["/tiered-storage-cache/kafka"]; exists { + t.Error("Tiered storage cache should not be in Cruise Control capacity") + } + + if _, exists := diskCapacity["/kafka-logs/kafka"]; !exists { + t.Error("Regular storage should be in Cruise Control capacity") + } + }) } From 710e7aedd6f018b3a7a0089b5adf55ae335019bd Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Wed, 1 Apr 2026 12:28:52 +0300 Subject: [PATCH 02/21] initial commit --- charts/kafka-operator/crds/kafkaclusters.yaml | 12 + .../kafka.banzaicloud.io_kafkaclusters.yaml | 12 + .../samples/kafkacluster_tiered_storage.yaml | 147 +++++++ ...r_tiered_storage_cache_resize_initial.yaml | 98 +++++ ...er_tiered_storage_cache_resize_shrunk.yaml | 98 +++++ docs/tiered-storage-pvc-resize.md | 302 +++++++++++++++ pkg/resources/kafka/kafka.go | 237 +++++++++++- pkg/resources/kafka/kafka_test.go | 234 +++++++++++ pkg/resources/kafka/pvc.go | 9 +- tests/e2e/test_tiered_storage_cache_resize.go | 364 ++++++++++++++++++ 10 files changed, 1506 insertions(+), 7 deletions(-) create mode 100644 config/samples/kafkacluster_tiered_storage.yaml create mode 100644 config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml create mode 100644 config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml create mode 100644 docs/tiered-storage-pvc-resize.md create mode 100644 tests/e2e/test_tiered_storage_cache_resize.go diff --git a/charts/kafka-operator/crds/kafkaclusters.yaml b/charts/kafka-operator/crds/kafkaclusters.yaml index 7abba845c..4e09f60f5 100644 --- a/charts/kafka-operator/crds/kafkaclusters.yaml +++ b/charts/kafka-operator/crds/kafkaclusters.yaml @@ -5101,6 +5101,12 @@ spec: the PersistentVolume backing this claim. type: string type: object + tieredStorageCache: + description: |- + TieredStorageCache indicates this storage is used for Kafka tiered storage cache + (e.g., for rsm.config.fetch.chunk.cache.path). When set to true, this storage will be + excluded from Cruise Control capacity calculations and will not be used as a Kafka log.dir. + type: boolean required: - mountPath type: object @@ -12144,6 +12150,12 @@ spec: to the PersistentVolume backing this claim. type: string type: object + tieredStorageCache: + description: |- + TieredStorageCache indicates this storage is used for Kafka tiered storage cache + (e.g., for rsm.config.fetch.chunk.cache.path). When set to true, this storage will be + excluded from Cruise Control capacity calculations and will not be used as a Kafka log.dir. + type: boolean required: - mountPath type: object diff --git a/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml b/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml index 7abba845c..901a688c8 100644 --- a/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml +++ b/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml @@ -5101,6 +5101,12 @@ spec: the PersistentVolume backing this claim. type: string type: object + tieredStorageCache: + description: |- + TieredStorageCache indicates this storage is used for Kafka tiered storage cache + (e.g., for rsm.config.fetch.chunk.cache.path). When set to true, this storage will be + excluded from Cruise Control capacity calculations and will not be used as a Kafka log.dir. + type: boolean required: - mountPath type: object @@ -12144,6 +12150,12 @@ spec: to the PersistentVolume backing this claim. type: string type: object + tieredStorageCache: + description: |- + TieredStorageCache indicates this storage is used for Kafka tiered storage cache + (e.g., for rsm.config.fetch.chunk.cache.path). When set to true, this storage will be + excluded from Cruise Control capacity calculations and will not be used as a Kafka log.dir. + type: boolean required: - mountPath type: object diff --git a/config/samples/kafkacluster_tiered_storage.yaml b/config/samples/kafkacluster_tiered_storage.yaml new file mode 100644 index 000000000..e79c5b72b --- /dev/null +++ b/config/samples/kafkacluster_tiered_storage.yaml @@ -0,0 +1,147 @@ +apiVersion: kafka.banzaicloud.io/v1beta1 +kind: KafkaCluster +metadata: + labels: + controller-tools.k8s.io: "1.0" + name: kafka-tiered-storage +spec: + kRaft: true + headlessServiceEnabled: true + oneBrokerPerNode: false + clusterImage: "ghcr.io/adobe/koperator/kafka:2.13-3.9.1" + + # Cluster-wide Kafka configuration including tiered storage settings + readOnlyConfig: | + auto.create.topics.enable=false + cruise.control.metrics.topic.auto.create=true + cruise.control.metrics.topic.num.partitions=1 + cruise.control.metrics.topic.replication.factor=2 + + # Tiered Storage Configuration + # Enable remote storage + remote.log.storage.system.enable=true + + # Remote storage manager class implementation + remote.log.storage.manager.class.name=org.apache.kafka.server.log.remote.storage.RemoteLogStorageManager + + # Remote log metadata manager class implementation + remote.log.metadata.manager.class.name=org.apache.kafka.server.log.remote.metadata.storage.TopicBasedRemoteLogMetadataManager + + # Remote storage manager configuration prefix + rsm.config.remote.storage.manager.impl.class=org.apache.kafka.server.log.remote.storage.RemoteLogStorageManager + + # Tiered storage cache configuration - this path matches the tieredStorageCache volume mount + rsm.config.fetch.chunk.cache.path=/tiered-storage-cache + + brokers: + - id: 0 + brokerConfig: + # Process roles for KRaft mode + processRoles: + - broker + + # Storage configurations + storageConfigs: + # Primary Kafka log storage - included in Cruise Control capacity + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 100Gi + + # Tiered storage cache - excluded from Cruise Control capacity and log.dirs + - mountPath: "/tiered-storage-cache" + tieredStorageCache: true + pvcSpec: + accessModes: + - ReadWriteOnce + # Can use a different storage class optimized for cache (e.g., faster SSD) + # storageClassName: fast-ssd + resources: + requests: + storage: 50Gi + + resourceRequirements: + limits: + cpu: "2" + memory: 4Gi + requests: + cpu: "1" + memory: 2Gi + + - id: 1 + brokerConfig: + processRoles: + - broker + + storageConfigs: + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 100Gi + + - mountPath: "/tiered-storage-cache" + tieredStorageCache: true + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + + resourceRequirements: + limits: + cpu: "2" + memory: 4Gi + requests: + cpu: "1" + memory: 2Gi + + - id: 2 + brokerConfig: + processRoles: + - broker + + storageConfigs: + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 100Gi + + - mountPath: "/tiered-storage-cache" + tieredStorageCache: true + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + + resourceRequirements: + limits: + cpu: "2" + memory: 4Gi + requests: + cpu: "1" + memory: 2Gi + + listenersConfig: + internalListeners: + - type: "plaintext" + name: "internal" + containerPort: 29092 + usedForInnerBrokerCommunication: true + - type: "plaintext" + name: "controller" + containerPort: 29093 + usedForInnerBrokerCommunication: false + usedForControllerCommunication: true + diff --git a/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml b/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml new file mode 100644 index 000000000..d6cae7a43 --- /dev/null +++ b/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml @@ -0,0 +1,98 @@ +apiVersion: kafka.banzaicloud.io/v1beta1 +kind: KafkaCluster +metadata: + name: kafka-ts-resize + namespace: kafka +spec: + kRaft: true + headlessServiceEnabled: true + oneBrokerPerNode: false + clusterImage: "ghcr.io/adobe/koperator/kafka:2.13-3.9.1" + readOnlyConfig: | + auto.create.topics.enable=false + rollingUpgradeConfig: + failureThreshold: 1 + cruiseControlConfig: + topicConfig: + partitions: 1 + replicationFactor: 2 + brokers: + - id: 0 + brokerConfig: + processRoles: + - broker + terminationGracePeriodSeconds: 10 + storageConfigs: + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + - mountPath: "/tiered-storage-cache" + tieredStorageCache: true + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + resourceRequirements: + limits: + cpu: "1" + memory: 2Gi + requests: + cpu: "200m" + memory: 1Gi + - id: 1 + brokerConfig: + processRoles: + - controller + terminationGracePeriodSeconds: 10 + storageConfigs: + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + resourceRequirements: + limits: + cpu: "500m" + memory: 1Gi + requests: + cpu: "100m" + memory: 512Mi + - id: 2 + brokerConfig: + processRoles: + - broker + terminationGracePeriodSeconds: 10 + storageConfigs: + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + resourceRequirements: + limits: + cpu: "500m" + memory: 1Gi + requests: + cpu: "100m" + memory: 512Mi + listenersConfig: + internalListeners: + - type: "plaintext" + name: "internal" + containerPort: 29092 + usedForInnerBrokerCommunication: true + - type: "plaintext" + name: "controller" + containerPort: 29093 + usedForInnerBrokerCommunication: false + usedForControllerCommunication: true diff --git a/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml b/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml new file mode 100644 index 000000000..88a7fc27d --- /dev/null +++ b/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml @@ -0,0 +1,98 @@ +apiVersion: kafka.banzaicloud.io/v1beta1 +kind: KafkaCluster +metadata: + name: kafka-ts-resize + namespace: kafka +spec: + kRaft: true + headlessServiceEnabled: true + oneBrokerPerNode: false + clusterImage: "ghcr.io/adobe/koperator/kafka:2.13-3.9.1" + readOnlyConfig: | + auto.create.topics.enable=false + rollingUpgradeConfig: + failureThreshold: 1 + cruiseControlConfig: + topicConfig: + partitions: 1 + replicationFactor: 2 + brokers: + - id: 0 + brokerConfig: + processRoles: + - broker + terminationGracePeriodSeconds: 10 + storageConfigs: + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + - mountPath: "/tiered-storage-cache" + tieredStorageCache: true + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi # Reduced from 2Gi to trigger delete-and-recreate + resourceRequirements: + limits: + cpu: "1" + memory: 2Gi + requests: + cpu: "200m" + memory: 1Gi + - id: 1 + brokerConfig: + processRoles: + - controller + terminationGracePeriodSeconds: 10 + storageConfigs: + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + resourceRequirements: + limits: + cpu: "500m" + memory: 1Gi + requests: + cpu: "100m" + memory: 512Mi + - id: 2 + brokerConfig: + processRoles: + - broker + terminationGracePeriodSeconds: 10 + storageConfigs: + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + resourceRequirements: + limits: + cpu: "500m" + memory: 1Gi + requests: + cpu: "100m" + memory: 512Mi + listenersConfig: + internalListeners: + - type: "plaintext" + name: "internal" + containerPort: 29092 + usedForInnerBrokerCommunication: true + - type: "plaintext" + name: "controller" + containerPort: 29093 + usedForInnerBrokerCommunication: false + usedForControllerCommunication: true diff --git a/docs/tiered-storage-pvc-resize.md b/docs/tiered-storage-pvc-resize.md new file mode 100644 index 000000000..7336360f4 --- /dev/null +++ b/docs/tiered-storage-pvc-resize.md @@ -0,0 +1,302 @@ +# Tiered Storage Cache PVC Resize: Reconciliation Flow + +This document explains how koperator handles resizing (shrinking) a tiered storage +cache PVC, why the flow is more involved than a regular PVC update, and the +design decisions behind the current implementation. + +--- + +## Background: why cache PVCs are different + +Kubernetes does not allow shrinking a PVC — the only supported in-place change is +growing it. For regular Kafka log volumes koperator enforces the same constraint +(`isDesiredStorageValueInvalid` returns an error on any size decrease). + +Tiered storage cache volumes are different: the data they hold is ephemeral. When a +broker restarts it will repopulate the cache from remote storage. Losing the cache +does not cause data loss; it only causes a temporary increase in read latency. +Because of this koperator implements a **delete-and-recreate** strategy for cache +PVC shrinks — but it must do so safely, coordinating with the rolling upgrade +machinery so that only one broker is down at a time and cluster health gates are +respected. + +--- + +## Components involved + +| Component | File | Role | +|-----------|------|------| +| `reconcileKafkaPvc` | `pkg/resources/kafka/kafka.go` | Creates / updates / deletes PVCs | +| `reconcileKafkaPod` | `pkg/resources/kafka/kafka.go` | Creates pods or hands off to rolling upgrade | +| `handleRollingUpgrade` | `pkg/resources/kafka/kafka.go` | Deletes a pod after checking health gates | +| `getCreatedPvcForBroker` | `pkg/resources/kafka/kafka.go` | Looks up which PVCs exist for a broker and feeds them into the pod spec | +| `generateDataVolumeAndVolumeMount` | `pkg/resources/kafka/pod.go` | Translates PVC list → pod Volume + VolumeMount entries using `ClaimName` | + +--- + +## PVC annotations used during resize + +Two annotations are written directly on PVC objects to carry state through the +resize. Because annotations live on the Kubernetes object they survive reconciler +restarts, making every step re-entrant. + +| Annotation | Value | Written on | Meaning | +|------------|-------|------------|---------| +| `koperator.adobe.com/cache-resize-state` | `pending-deletion` | Old PVC | This PVC is being replaced; skip it in pod spec generation and delete it once the broker pod stops | +| `koperator.adobe.com/cache-resize-state` | `replacement` | New PVC | This is the replacement PVC; defer to the rolling upgrade before treating it as normal | +| `koperator.adobe.com/replaces-pvc` | `` | New PVC | Traceability — records which PVC is being replaced | + +--- + +## Reconciliation loop order + +Within a single `Reconcile()` call the kafka sub-reconciler runs in this order: + +``` +1. Services +2. PodDisruptionBudgets +3. reconcileKafkaPodDelete ← graceful downscale via Cruise Control +4. Listener statuses / PKI +5. reconcileKafkaPvc ← ALL PVC work happens here, before any pod work +6. Discover running pods + bound PVCs +7. reorderBrokers ← priority sort +8. FOR each broker: + a. ConfigMap + b. per-broker Service + c. reconcileKafkaPod ← pod create OR handleRollingUpgrade + d. reconcilePerBrokerDynamicConfig +9. reconcileClusterWideDynamicConfig +``` + +PVC reconciliation always runs before pod reconciliation. The new PVC is therefore +created (and starts provisioning) before `handleRollingUpgrade` is evaluated for +the same broker in the same cycle. + +--- + +## Full flow: shrinking a tiered storage cache PVC + +### Cycle N — resize detected (pod running) + +**`reconcileKafkaPvc` — per-broker setup** + +``` +r.List(brokerPodList) +brokerPodExists = true + +No pending-deletion PVCs exist yet → no cleanup +No replacement PVCs exist yet → no resize-complete strip +effectivePvcCount == len(desiredPvcs) → no CC disk removal triggered +``` + +**`reconcileKafkaPvc` — per-desired-PVC loop** + +``` +CheckIfObjectUpdated: currentSize > desiredSize → enters resize branch + + 1. Annotate old PVC: + koperator.adobe.com/cache-resize-state: pending-deletion + (r.Update — durable immediately in etcd) + + 2. Create replacement PVC with desiredSize: + koperator.adobe.com/cache-resize-state: replacement + koperator.adobe.com/replaces-pvc: + (r.Create — provisioning starts now, in parallel with gate evaluation) + + 3. Set broker ConfigurationState = ConfigOutOfSync + (triggers handleRollingUpgrade on every cycle until pod restarts) + + continue +``` + +**`reconcileKafkaPod` — same cycle** + +`handleRollingUpgrade` sees `ConfigOutOfSync`, evaluates gates: + +| Gate | Required to pass | +|------|-----------------| +| Pod count | All expected pods exist | +| Concurrent restart limit | Terminating/Pending pods < `ConcurrentBrokerRestartCountPerRack` | +| Rack awareness | Only pods from same AZ as restarting pods | +| Replica health | `offline + out-of-sync < FailureThreshold` | + +If all pass → **broker pod is deleted** → requeue 15 s. +If any fail → requeue 15 s, try again next cycle (state is fully durable via PVC annotations). + +--- + +### Between cycles (rolling upgrade gates blocking) + +Each reconcile cycle sees: + +``` +reconcileKafkaPvc: + brokerPodExists = true + No pending-deletion PVCs → no cleanup + Replacement PVC exists, no pending-deletion → resize-complete strip + … but pod is still up → strip does NOT fire (pod must be up AND no pending-deletion) + alreadyCreated loop: skips pending-deletion PVC, finds replacement PVC + replacement PVC guard: ensure ConfigOutOfSync, continue + +reconcileKafkaPod: + handleRollingUpgrade re-evaluates gates → requeue if blocked +``` + +State is preserved entirely in PVC annotations — no in-memory or status-field +bookkeeping required. Reconciler can crash and restart at any point. + +--- + +### Cycle N+1 — pod is gone + +**`reconcileKafkaPvc` — per-broker setup** + +``` +r.List(brokerPodList) +brokerPodExists = false + +Cleanup loop: finds PVC with pending-deletion annotation + → r.Delete(oldPvc) ← safe now, broker is not running + → r.List(pvcList) ← re-list; only replacement PVC remains +``` + +**`getCreatedPvcForBroker` (called later to build pod spec)** + +Filters out any PVC with `cache-resize-state: pending-deletion` before matching +mount paths. Returns the replacement PVC for the mount path. The pod spec is built +with `ClaimName` pointing to the new PVC. + +**`reconcileKafkaPod`** + +No pod exists → creates new pod referencing the replacement PVC. Kubernetes holds +the pod in `Pending` until the replacement PVC reaches `Bound` phase. + +Because provisioning started in cycle N (when the replacement PVC was created), +the PVC is likely already `Bound` by now, minimising pod startup latency. + +--- + +### Cycle N+2 — pod is running again + +**`reconcileKafkaPvc` — resize-complete strip** + +``` +brokerPodExists = true +No pending-deletion PVCs (deleted in N+1) +Replacement PVC exists → resize complete → strip annotations: + delete koperator.adobe.com/cache-resize-state + delete koperator.adobe.com/replaces-pvc + r.Update(replacementPvc) +``` + +The PVC is now an ordinary PVC. Subsequent reconciles treat it normally. + +--- + +## Sequence diagram + +``` +Cycle N (pod UP, resize detected) + reconcileKafkaPvc + ├─ r.Update(oldPvc) annotate pending-deletion + ├─ r.Create(newPvc) replacement PVC, provisioning starts + └─ ConfigOutOfSync set + reconcileKafkaPod + └─ handleRollingUpgrade + ├─ [gates fail] → requeue 15s + └─ [gates pass] → delete pod → requeue 15s + +Cycle N+k (pod UP, gates failing — any number of cycles) + reconcileKafkaPvc + └─ replacement PVC guard: ensure ConfigOutOfSync, continue + reconcileKafkaPod + └─ handleRollingUpgrade → requeue + +Cycle N+k+1 (pod GONE) + reconcileKafkaPvc + ├─ delete pending-deletion PVC + └─ re-list: only replacement PVC remains + reconcileKafkaPod + └─ no pod → create pod (ClaimName = replacement PVC) + └─ pod Pending until PVC bound (likely already bound) + +Cycle N+k+2 (pod RUNNING) + reconcileKafkaPvc + └─ no pending-deletion PVC + replacement PVC exists → strip annotations + └─ replacement PVC becomes ordinary PVC +``` + +--- + +## Properties of this design + +| Property | Value | +|----------|-------| +| State survives reconciler crash | Yes — PVC annotations are durable in etcd | +| Idempotent re-entry | Yes — each phase checks annotations and resumes correctly | +| Atomicity gap | Eliminated — new PVC is created before old is deleted | +| Provisioning overlaps gate evaluation | Yes — new PVC created in cycle N, not N+1 | +| Observable via kubectl | Yes — `kubectl get pvc -o yaml` shows resize state directly | +| ConfigOutOfSync overloading | Reduced — `ConfigOutOfSync` still used, but the *reason* is legible in PVC annotations | +| CC disk rebalance for cache PVCs | Fixed — tiered cache PVCs are explicitly excluded from `GracefulDiskRebalanceRequired` logic | + +--- + +## Interaction with disk removal detection + +`reconcileKafkaPvc` detects regular disk removal by comparing PVC counts: + +```go +if effectivePvcCount > len(desiredPvcs) { handleDiskRemoval(...) } +``` + +The count uses `effectivePvcCount` which **excludes replacement PVCs**. During a +resize the old (pending-deletion) + new (replacement) PVCs temporarily co-exist for +the same mount path. Without this exclusion the count check would incorrectly +trigger a Cruise Control disk-removal operation. + +Inside `handleDiskRemoval` itself, the `pending-deletion` PVC is also safe: its +mount path still matches the desired spec, so `foundInDesired = true` and it is +skipped by the disk-removal state machine. + +--- + +## Interaction with GracefulDiskRebalanceRequired + +When a new PVC first becomes `Bound`, koperator normally sets +`GracefulDiskRebalanceRequired` to ask Cruise Control to rebalance data across +the broker's disks. This is correct for Kafka log volumes but wrong for tiered +storage cache volumes — CC must not account for ephemeral cache storage. + +The `alreadyCreated` loop now explicitly skips `GracefulDiskRebalanceRequired` for +any PVC annotated `tieredStorageCache: true`, including the replacement PVC. + +--- + +## Known limitations + +### ConfigOutOfSync still shared with config changes + +`ConfigOutOfSync` is set to trigger the rolling upgrade. It is the same bit used +for Kafka property changes. An observer cannot distinguish "resize pending" from +"config change" by status alone — the PVC annotations must be inspected. + +### Concurrent resize + complete storage-config removal + +If the tiered storage cache storage config is removed entirely from the spec while +a resize is in progress, both the `pending-deletion` and `replacement` PVCs have a +mount path that is no longer present in the desired spec. Without special handling +this would route them into the Cruise Control `remove_disks` path — which fails with +"log dir not found" because CC has no knowledge of ephemeral cache paths — leaving +the operator stuck in a 20-second requeue loop. + +koperator handles this as follows: + +**Pod DOWN**: both the `pending-deletion` and the orphaned `replacement` PVC are +deleted directly in `reconcileKafkaPvc` (bypassing Cruise Control). The operator +proceeds normally on the next cycle. + +**Pod UP**: koperator sets `ConfigOutOfSync` to trigger a rolling restart via +`handleRollingUpgrade`, then skips `handleDiskRemoval` for this broker entirely +(`continue` in the broker loop). Once the pod stops, the next cycle falls into the +Pod DOWN path above and cleans up both PVCs. diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index e2a5c1168..243ededf5 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -67,6 +67,14 @@ const ( brokerConfigTemplate = "%s-config" brokerStorageTemplate = "%s-%d-storage-%d-" + // pvcCacheResizeStateAnnotation is set on PVCs during a tiered storage cache resize to track + // which PVC is the old one awaiting deletion and which is the new replacement. + pvcCacheResizeStateAnnotation = "koperator.adobe.com/cache-resize-state" + pvcCacheResizePendingDeletion = "pending-deletion" + pvcCacheResizeReplacement = "replacement" + // pvcCacheResizeReplacesPVC records the name of the old PVC that the replacement supersedes. + pvcCacheResizeReplacesPVC = "koperator.adobe.com/replaces-pvc" + brokerConfigMapVolumeMount = "broker-config" kafkaDataVolumeMount = "kafka-data" @@ -158,6 +166,17 @@ func getCreatedPvcForBroker( return nil, err } + // Exclude PVCs that are pending deletion during a tiered storage cache resize. + // The replacement PVC (same mount path, smaller size) is already present and will be used instead. + n := 0 + for _, pvc := range foundPvcList.Items { + if pvc.Annotations[pvcCacheResizeStateAnnotation] != pvcCacheResizePendingDeletion { + foundPvcList.Items[n] = pvc + n++ + } + } + foundPvcList.Items = foundPvcList.Items[:n] + var missing []string for i := range storageConfigs { if storageConfigs[i].PvcSpec == nil { @@ -317,6 +336,12 @@ func (r *Reconciler) Reconcile(log logr.Logger) error { if storage.PvcSpec == nil { continue } + // Tiered storage cache PVCs are only meaningful on broker nodes. + // Controller-only nodes do not serve Kafka client traffic or hold tiered-storage + // segment data, so skip cache PVCs for them. + if storage.TieredStorageCache && brokerConfig.IsControllerNode() && !brokerConfig.IsBrokerNode() { + continue + } o, err := r.pvc(broker.Id, index, storage) if err != nil { return errors.WrapIfWithDetails(err, "failed to generate resource", "resources", "PersistentVolumeClaim") @@ -1184,8 +1209,130 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro } } - // Handle disk removal - if len(pvcList.Items) > len(desiredPvcs) { + // Resolve pod existence once per broker — used by both cleanup and resize logic below. + // A pod with a non-nil DeletionTimestamp is Terminating: its containers have stopped (or + // are stopping) and any PVCs it mounted will be released once the pod is fully gone. + // Treating a Terminating pod as "not existing" lets the pending-deletion PVC be cleaned up + // during the Terminating window rather than waiting for the narrow gap between the pod being + // fully removed from etcd and the new pod being created in the same reconcile cycle. + brokerPodList := &corev1.PodList{} + brokerPodMatchingLabels := client.MatchingLabels( + apiutil.MergeLabels( + apiutil.LabelsForKafka(r.KafkaCluster.Name), + map[string]string{banzaiv1beta1.BrokerIdLabelKey: brokerId}, + ), + ) + if err := r.List(ctx, brokerPodList, client.InNamespace(r.KafkaCluster.GetNamespace()), brokerPodMatchingLabels); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, "getting broker pods failed", "brokerId", brokerId) + } + brokerPodExists := false + for i := range brokerPodList.Items { + if brokerPodList.Items[i].DeletionTimestamp == nil { + brokerPodExists = true + break + } + } + + // Build a set of desired mount paths for orphaned-PVC detection below. + desiredMountPaths := make(map[string]bool, len(desiredPvcs)) + for _, dp := range desiredPvcs { + desiredMountPaths[dp.Annotations["mountPath"]] = true + } + + if brokerPodExists { + // Resize complete when the replacement PVC exists but the pending-deletion PVC is already gone: + // strip the replacement annotation so the PVC is treated as normal going forward. + var hasPendingDeletion bool + for _, pvc := range pvcList.Items { + if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { + hasPendingDeletion = true + break + } + } + if !hasPendingDeletion { + for _, pvc := range pvcList.Items { + if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement { + pvcCopy := pvc.DeepCopy() + delete(pvcCopy.Annotations, pvcCacheResizeStateAnnotation) + delete(pvcCopy.Annotations, pvcCacheResizeReplacesPVC) + if err := r.Update(ctx, pvcCopy); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, + "removing replacement annotation from tiered storage cache PVC failed", "pvc", pvc.Name) + } + log.Info("Tiered storage cache PVC resize complete, removed replacement annotation", "pvc", pvc.Name) + } + } + } + } else { + // Broker pod is down — safe to delete any PVCs that were waiting for the pod to stop. + for i := range pvcList.Items { + pvc := &pvcList.Items[i] + if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { + log.Info("Broker pod is down — deleting pending-deletion tiered storage cache PVC", + "brokerId", brokerId, "pvc", pvc.Name) + if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { + return errorfactory.New(errorfactory.APIFailure{}, err, + "deleting pending-deletion tiered storage cache PVC failed", "pvc", pvc.Name) + } + } + } + // Also delete replacement PVCs whose mount path is no longer desired — these are + // orphaned by a storage config removal that happened while a cache resize was in flight. + for i := range pvcList.Items { + pvc := &pvcList.Items[i] + if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement && + !desiredMountPaths[pvc.Annotations["mountPath"]] { + log.Info("Broker pod is down — deleting orphaned replacement tiered storage cache PVC", + "brokerId", brokerId, "pvc", pvc.Name) + if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { + return errorfactory.New(errorfactory.APIFailure{}, err, + "deleting orphaned replacement tiered storage cache PVC failed", "pvc", pvc.Name) + } + } + } + // Re-list so the rest of this iteration sees the current state. + if err := r.List(ctx, pvcList, client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, "getting resource failed", "kind", desiredType) + } + } + + // If the pod is still running and there are orphaned cache resize PVCs (storage config + // removed while a resize was in flight), trigger a rolling restart so the pod stops + // and we can delete the orphaned PVCs on the next cycle. Routing these through + // Cruise Control remove_disks would fail because cache paths are not Kafka log dirs. + if brokerPodExists { + var hasOrphanedCachePvc bool + for _, pvc := range pvcList.Items { + resizeState := pvc.Annotations[pvcCacheResizeStateAnnotation] + if (resizeState == pvcCacheResizePendingDeletion || resizeState == pvcCacheResizeReplacement) && + !desiredMountPaths[pvc.Annotations["mountPath"]] { + hasOrphanedCachePvc = true + break + } + } + if hasOrphanedCachePvc { + log.Info("Orphaned cache resize PVCs detected with broker pod running — triggering rolling restart for safe cleanup", + "brokerId", brokerId) + if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + banzaiv1beta1.ConfigOutOfSync, log); err != nil { + return errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not mark broker ConfigOutOfSync for orphaned cache PVC cleanup", "brokerId", brokerId) + } + } + continue // Skip disk removal — wait for pod to stop before deleting cache PVCs. + } + } + + // Handle disk removal — count only non-replacement PVCs to avoid false positives while a + // tiered storage cache resize is in flight (old + new PVC temporarily coexist). + effectivePvcCount := 0 + for _, pvc := range pvcList.Items { + if pvc.Annotations[pvcCacheResizeStateAnnotation] != pvcCacheResizeReplacement { + effectivePvcCount++ + } + } + if effectivePvcCount > len(desiredPvcs) { waitForDiskRemovalToFinish, err = handleDiskRemoval(ctx, pvcList, desiredPvcs, r, brokerId, log, desiredType, brokerVolumesState) if err != nil { return err @@ -1217,9 +1364,20 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro alreadyCreated := false for _, pvc := range pvcList.Items { - if mountPath == pvc.Annotations[mountPathAnnotationKey] { - currentPvc = pvc.DeepCopy() - alreadyCreated = true + if mountPath != pvc.Annotations[mountPathAnnotationKey] { + continue + } + // Skip the old PVC that is waiting to be deleted once the broker pod stops. + // The replacement PVC (same mount path) will be matched on the next iteration. + if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { + continue + } + currentPvc = pvc.DeepCopy() + alreadyCreated = true + // Trigger a CC disk rebalance only for regular data volumes. + // Tiered storage cache PVCs are ephemeral — CC must not account for them. + isCachePvc := pvc.Annotations["tieredStorageCache"] == "true" + if !isCachePvc { // Checking pvc state, if bounded, so the broker has already restarted and the CC GracefulDiskRebalance has not happened yet, // then we make it happening with status update. // If disk removal was set, and the disk was added back, we also need to mark the volume for rebalance @@ -1228,8 +1386,8 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro (!found || volumeState.CruiseControlVolumeState.IsDiskRemoval()) { brokerVolumesState[mountPath] = banzaiv1beta1.VolumeState{CruiseControlVolumeState: banzaiv1beta1.GracefulDiskRebalanceRequired} } - break } + break } if !alreadyCreated { @@ -1243,11 +1401,78 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro continue } if err == nil { + // A replacement PVC is the new smaller PVC staged during a tiered storage cache resize. + // It already has the right size; we only need to keep the rolling upgrade running so + // the broker pod stops and the pending-deletion PVC can be cleaned up. + if currentPvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement { + if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + banzaiv1beta1.ConfigOutOfSync, log); err != nil { + return errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not mark broker ConfigOutOfSync for pending tiered storage cache PVC resize", "brokerId", brokerId) + } + } + continue + } + if k8sutil.CheckIfObjectUpdated(log, desiredType, currentPvc, desiredPvc) { if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { return errors.WrapIf(err, "could not apply last state to annotation") } + // Check if this is a tiered storage cache volume + isTieredCache := currentPvc.Annotations["tieredStorageCache"] == "true" + desiredSize := desiredPvc.Spec.Resources.Requests.Storage().Value() + currentSize := currentPvc.Spec.Resources.Requests.Storage().Value() + + // Tiered storage cache PVC shrink: stage the replacement PVC immediately so + // provisioning runs in parallel with rolling-upgrade gate evaluation. + // The old PVC is annotated pending-deletion and removed once the broker pod stops. + if isTieredCache && desiredSize < currentSize { + log.Info("Tiered storage cache size decrease detected — staging replacement PVC", + "brokerId", brokerId, + "mountPath", mountPath, + "currentSize", currentSize, + "desiredSize", desiredSize) + + // Annotate old PVC so it is skipped by getCreatedPvcForBroker and cleaned up + // the moment the broker pod stops. + oldPvcCopy := currentPvc.DeepCopy() + oldPvcCopy.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizePendingDeletion + if err := r.Update(ctx, oldPvcCopy); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, + "annotating old tiered storage cache PVC for deletion failed", "pvc", currentPvc.Name) + } + log.Info("Marked old tiered storage cache PVC for deletion", "brokerId", brokerId, "pvc", currentPvc.Name) + + // Create the replacement PVC immediately so provisioning starts now. + desiredPvc.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizeReplacement + desiredPvc.Annotations[pvcCacheResizeReplacesPVC] = currentPvc.Name + if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { + return errors.WrapIf(err, "could not apply last state to annotation") + } + if err := r.Create(ctx, desiredPvc); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, + "creating replacement tiered storage cache PVC failed", "brokerId", brokerId) + } + log.Info("Created replacement tiered storage cache PVC", + "brokerId", brokerId, "pvc", desiredPvc.Name, "desiredSize", desiredSize) + + // Trigger rolling upgrade so the broker pod is restarted and the old PVC + // can be deleted on the next reconcile cycle. + if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { + log.Info("Marking broker ConfigOutOfSync to trigger rolling upgrade for tiered storage cache PVC resize", + "brokerId", brokerId) + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + banzaiv1beta1.ConfigOutOfSync, log); err != nil { + return errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not mark broker ConfigOutOfSync for tiered storage cache PVC resize", "brokerId", brokerId) + } + } + continue + } + + // Regular validation: size decreases are forbidden for non-cache volumes. if isDesiredStorageValueInvalid(desiredPvc, currentPvc) { return errorfactory.New(errorfactory.InternalError{}, errors.New("could not modify pvc size"), "one can not reduce the size of a PVC", "kind", desiredType) diff --git a/pkg/resources/kafka/kafka_test.go b/pkg/resources/kafka/kafka_test.go index ad9e6db4b..33093680a 100644 --- a/pkg/resources/kafka/kafka_test.go +++ b/pkg/resources/kafka/kafka_test.go @@ -32,8 +32,10 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/types" + "github.com/banzaicloud/k8s-objectmatcher/patch" "github.com/banzaicloud/koperator/api/v1alpha1" "github.com/banzaicloud/koperator/api/v1beta1" controllerMocks "github.com/banzaicloud/koperator/controllers/tests/mocks" @@ -1257,6 +1259,15 @@ func execPvcTest(t *testing.T, testCases []PvcTestCase) { }, } + // Mock pod list — the new code always checks pod existence before disk removal. + // For these tests the broker pod is always absent. + mockClient.EXPECT().List( + context.TODO(), + gomock.AssignableToTypeOf(&corev1.PodList{}), + client.InNamespace("kafka"), + gomock.Any(), + ).Return(nil).AnyTimes() + // Set up the mockClient to return the provided test.existingPvcs mockClient.EXPECT().List( context.TODO(), @@ -1336,6 +1347,229 @@ func createPvc(name, brokerId, mountPath string) *corev1.PersistentVolumeClaim { } } +func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { + t.Parallel() + + const ( + clusterName = "kafka" + namespace = "kafka" + brokerId = "0" + mountPath = "/tiered-storage-cache" + ) + + makeTieredCachePvc := func(name, size string) *corev1.PersistentVolumeClaim { + qty := resource.MustParse(size) + pvc := &corev1.PersistentVolumeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Labels: map[string]string{ + v1beta1.BrokerIdLabelKey: brokerId, + v1beta1.AppLabelKey: "kafka", + v1beta1.KafkaCRLabelKey: clusterName, + }, + Annotations: map[string]string{ + "mountPath": mountPath, + "tieredStorageCache": "true", + }, + }, + Spec: corev1.PersistentVolumeClaimSpec{ + Resources: corev1.VolumeResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceStorage: qty, + }, + }, + }, + Status: corev1.PersistentVolumeClaimStatus{Phase: corev1.ClaimBound}, + } + // Set last-applied annotation so CheckIfObjectUpdated detects size differences + _ = patch.DefaultAnnotator.SetLastAppliedAnnotation(pvc) + return pvc + } + + runningPod := corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "kafka-0", + Namespace: namespace, + }, + } + terminatingPod := corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "kafka-0", + Namespace: namespace, + DeletionTimestamp: &metav1.Time{Time: time.Now()}, + }, + } + + testCases := []struct { + testName string + existingPvc *corev1.PersistentVolumeClaim + desiredPvc *corev1.PersistentVolumeClaim + existingPods []corev1.Pod + // expectedUpdatePvc: old PVC annotated pending-deletion, or annotation stripped on resize-complete + expectedUpdatePvc bool + expectedCreatePvc bool + expectedDeletePvc bool + expectedError bool + }{ + { + // Pod is up: annotate old PVC as pending-deletion, create replacement PVC, + // set ConfigOutOfSync to trigger rolling upgrade. No error — state is durable. + testName: "size decrease with running pod — old PVC annotated, replacement PVC created", + existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), + desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), + existingPods: []corev1.Pod{runningPod}, + expectedUpdatePvc: true, + expectedCreatePvc: true, + expectedDeletePvc: false, + expectedError: false, + }, + { + // Terminating pod is treated as having no running pod: staging starts immediately + // so the replacement PVC is provisioned during the drain window. + testName: "size decrease with terminating pod — old PVC annotated, replacement PVC created", + existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), + desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), + existingPods: []corev1.Pod{terminatingPod}, + expectedUpdatePvc: true, + expectedCreatePvc: true, + expectedDeletePvc: false, + expectedError: false, + }, + { + // Pod is terminating and old PVC already has pending-deletion annotation: + // the Terminating pod is treated as gone, so cleanup fires immediately. + testName: "pending-deletion PVC with terminating pod — old PVC deleted", + existingPvc: func() *corev1.PersistentVolumeClaim { + pvc := makeTieredCachePvc("cache-pvc-1", "100Gi") + pvc.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizePendingDeletion + return pvc + }(), + desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), + existingPods: []corev1.Pod{terminatingPod}, + expectedUpdatePvc: false, + expectedCreatePvc: true, + expectedDeletePvc: true, + expectedError: false, + }, + { + // Pod is already gone on first detection: stage the replacement anyway so the + // state machine is consistent. Cleanup of pending-deletion PVC happens on the + // next cycle once the annotation is observed. + testName: "size decrease with pod already gone — old PVC annotated, replacement PVC created", + existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), + desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), + existingPods: []corev1.Pod{}, + expectedUpdatePvc: true, + expectedCreatePvc: true, + expectedDeletePvc: false, + expectedError: false, + }, + { + // Pod is gone and old PVC already has pending-deletion annotation: + // cleanup fires — old PVC is deleted. Replacement PVC already exists. + testName: "size decrease with pod gone and old PVC pending-deletion — old PVC deleted", + existingPvc: func() *corev1.PersistentVolumeClaim { + pvc := makeTieredCachePvc("cache-pvc-1", "100Gi") + pvc.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizePendingDeletion + return pvc + }(), + desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), + existingPods: []corev1.Pod{}, + expectedUpdatePvc: false, + expectedCreatePvc: true, + expectedDeletePvc: true, + expectedError: false, + }, + { + // size increase — no special handling, regular PVC update path. + testName: "size increase — regular PVC update path", + existingPvc: makeTieredCachePvc("cache-pvc-1", "50Gi"), + desiredPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), + existingPods: []corev1.Pod{}, + expectedUpdatePvc: true, + expectedCreatePvc: false, + expectedDeletePvc: false, + expectedError: false, + }, + } + + mockCtrl := gomock.NewController(t) + + for _, test := range testCases { + test := test + t.Run(test.testName, func(t *testing.T) { + mockClient := mocks.NewMockClient(mockCtrl) + mockSubResourceClient := mocks.NewMockSubResourceClient(mockCtrl) + + r := Reconciler{ + Reconciler: resources.Reconciler{ + Client: mockClient, + KafkaCluster: &v1beta1.KafkaCluster{ + ObjectMeta: metav1.ObjectMeta{Name: clusterName, Namespace: namespace}, + Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{ + Id: 0, + BrokerConfig: &v1beta1.BrokerConfig{Roles: []string{"broker"}}, + }}, + }, + }, + }, + } + + // Mock PVC list — always returns the existing PVC + mockClient.EXPECT().List( + context.TODO(), + gomock.AssignableToTypeOf(&corev1.PersistentVolumeClaimList{}), + client.InNamespace(namespace), + gomock.Any(), + ).Do(func(ctx context.Context, list *corev1.PersistentVolumeClaimList, opts ...client.ListOption) { + list.Items = []corev1.PersistentVolumeClaim{*test.existingPvc} + }).Return(nil).AnyTimes() + + // Mock Pod list — returns the configured pods + mockClient.EXPECT().List( + context.TODO(), + gomock.AssignableToTypeOf(&corev1.PodList{}), + client.InNamespace(namespace), + gomock.Any(), + ).Do(func(ctx context.Context, list *corev1.PodList, opts ...client.ListOption) { + list.Items = test.existingPods + }).Return(nil).AnyTimes() + + // Mock PVC update (annotating pending-deletion or stripping replacement annotation) + if test.expectedUpdatePvc { + mockClient.EXPECT().Update(context.TODO(), gomock.AssignableToTypeOf(&corev1.PersistentVolumeClaim{})).Return(nil).AnyTimes() + } + + // Mock PVC deletion + if test.expectedDeletePvc { + mockClient.EXPECT().Delete(context.TODO(), gomock.AssignableToTypeOf(&corev1.PersistentVolumeClaim{})).Return(nil).Times(1) + } + + // Mock PVC creation + if test.expectedCreatePvc { + mockClient.EXPECT().Create(context.TODO(), gomock.AssignableToTypeOf(&corev1.PersistentVolumeClaim{})).Return(nil).Times(1) + } + + mockClient.EXPECT().Status().Return(mockSubResourceClient).AnyTimes() + mockSubResourceClient.EXPECT().Update(context.Background(), gomock.AssignableToTypeOf(&v1beta1.KafkaCluster{})).Return(nil).AnyTimes() + + brokersDesiredPvcs := map[string][]*corev1.PersistentVolumeClaim{ + brokerId: {test.desiredPvc}, + } + + err := r.reconcileKafkaPvc(context.TODO(), logf.Log, brokersDesiredPvcs) + + if test.expectedError { + assert.NotNil(t, err, "expected an error but got nil") + } else { + assert.Nil(t, err, "expected no error but got: %v", err) + } + }) + } +} + // nolint funlen func TestReconcileConcurrentBrokerRestartsAllowed(t *testing.T) { t.Parallel() diff --git a/pkg/resources/kafka/pvc.go b/pkg/resources/kafka/pvc.go index e0555a64d..f18e4febe 100644 --- a/pkg/resources/kafka/pvc.go +++ b/pkg/resources/kafka/pvc.go @@ -62,6 +62,13 @@ func (r *Reconciler) pvc(brokerId int32, storageIndex int, storage v1beta1.Stora return nil, errors.WrapIfWithDetails(err, "couldn't unmarshal Pvc spec", errCtx...) } + annotations := map[string]string{mountPathAnnotationKey: storage.MountPath} + + // Mark tiered storage cache PVCs with annotation for special handling + if storage.TieredStorageCache { + annotations["tieredStorageCache"] = "true" + } + return &corev1.PersistentVolumeClaim{ ObjectMeta: templates.ObjectMetaWithGeneratedNameAndAnnotations( fmt.Sprintf(brokerStorageTemplate, r.KafkaCluster.Name, brokerId, storageIndex), @@ -69,7 +76,7 @@ func (r *Reconciler) pvc(brokerId int32, storageIndex int, storage v1beta1.Stora apiutil.LabelsForKafka(r.KafkaCluster.Name), map[string]string{v1beta1.BrokerIdLabelKey: fmt.Sprintf("%d", brokerId)}, ), - map[string]string{mountPathAnnotationKey: storage.MountPath}, r.KafkaCluster), + annotations, r.KafkaCluster), Spec: pvcSpec, }, nil } diff --git a/tests/e2e/test_tiered_storage_cache_resize.go b/tests/e2e/test_tiered_storage_cache_resize.go new file mode 100644 index 000000000..0decedbc5 --- /dev/null +++ b/tests/e2e/test_tiered_storage_cache_resize.go @@ -0,0 +1,364 @@ +// Copyright 2025 Adobe. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package e2e + +import ( + "context" + "encoding/json" + "fmt" + "strings" + "time" + + "emperror.dev/errors" + "github.com/gruntwork-io/terratest/modules/k8s" + ginkgo "github.com/onsi/ginkgo/v2" + gomega "github.com/onsi/gomega" +) + +const ( + tsResizeClusterName = "kafka-ts-resize" + + tsResizeInitialManifest = "../../config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml" + tsResizeShrunkManifest = "../../config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml" + + tsResizeCacheMountPath = "/tiered-storage-cache" + tsResizeInitialSize = "2Gi" + tsResizeShrunkSize = "1Gi" + + // Annotation keys written by the cache-resize reconciler. + pvcResizeStateAnnotation = "koperator.adobe.com/cache-resize-state" + pvcResizeStatePending = "pending-deletion" + pvcResizeStateReplacement = "replacement" + + tsResizePhaseTimeout = 10 * time.Minute + tsResizePollingInterval = 15 * time.Second +) + +// pvcItem is a minimal representation of a PVC for assertion helpers. +type pvcItem struct { + Name string + Annotations map[string]string + StorageSize string +} + +// listBrokerCachePVCs returns PVCs for the given broker that have the +// tiered-storage-cache mount path annotation. +func listBrokerCachePVCs(kubectlOptions k8s.KubectlOptions, clusterName string, brokerID int) ([]pvcItem, error) { + selector := fmt.Sprintf("%s=%s,brokerId=%d", kafkaCRLabelKey, clusterName, brokerID) + + rawOutput, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + "get", "persistentvolumeclaims", + "-l", selector, + "--output", "json", + ) + if err != nil { + return nil, errors.WrapIf(err, "listing PVCs failed") + } + + var pvcList struct { + Items []struct { + Metadata struct { + Name string `json:"name"` + Annotations map[string]string `json:"annotations"` + } `json:"metadata"` + Spec struct { + Resources struct { + Requests struct { + Storage string `json:"storage"` + } `json:"requests"` + } `json:"resources"` + } `json:"spec"` + } `json:"items"` + } + if err := json.Unmarshal([]byte(rawOutput), &pvcList); err != nil { + return nil, errors.WrapIf(err, "parsing PVC list JSON failed") + } + + var result []pvcItem + for _, item := range pvcList.Items { + if item.Metadata.Annotations["mountPath"] == tsResizeCacheMountPath { + result = append(result, pvcItem{ + Name: item.Metadata.Name, + Annotations: item.Metadata.Annotations, + StorageSize: item.Spec.Resources.Requests.Storage, + }) + } + } + return result, nil +} + +// getBrokerPodUID returns the UID of the running (non-terminating) broker pod for the +// given broker ID, or an error if no such pod is found. +func getBrokerPodUID(kubectlOptions k8s.KubectlOptions, clusterName string, brokerID int) (string, error) { + rawOutput, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + "get", "pod", + "-l", fmt.Sprintf("%s=%s,brokerId=%d,app=kafka", kafkaCRLabelKey, clusterName, brokerID), + "--output", "json", + ) + if err != nil { + return "", errors.WrapIf(err, "listing broker pods failed") + } + + var podList struct { + Items []struct { + Metadata struct { + UID string `json:"uid"` + DeletionTimestamp *string `json:"deletionTimestamp"` + } `json:"metadata"` + } `json:"items"` + } + if err := json.Unmarshal([]byte(rawOutput), &podList); err != nil { + return "", errors.WrapIf(err, "parsing pod list JSON failed") + } + + for _, item := range podList.Items { + if item.Metadata.DeletionTimestamp == nil { + return item.Metadata.UID, nil + } + } + return "", fmt.Errorf("no running broker pod found for broker %d in cluster %s", brokerID, clusterName) +} + +// testTieredStorageCachePvcResize tests the full multi-phase delete-and-recreate +// flow for a tiered storage cache PVC shrink. It: +// +// 1. Installs a minimal KRaft cluster with a 2Gi cache PVC on broker 0. +// 2. Applies an updated manifest that shrinks the cache to 1Gi. +// 3. Waits for Phase 1 (staging): both the old PVC (pending-deletion) and +// the new PVC (replacement) exist simultaneously. +// 4. Waits for Phase 2 (pod cycle): the pod restarts, the old PVC is deleted, +// and the new pod starts referencing the replacement PVC. +// 5. Waits for Phase 3 (completion): the replacement annotation is stripped and +// the broker pod is running again. +// 6. Verifies the surviving PVC carries the new 1Gi size. +// 7. Cleans up the cluster. +func testTieredStorageCachePvcResize() bool { + return ginkgo.When("Testing tiered storage cache PVC shrink (delete-and-recreate flow)", ginkgo.Ordered, func() { + var kubectlOptions k8s.KubectlOptions + var err error + var broker0PodUID string // UID of the broker-0 pod before the resize, used to detect recycling + + ginkgo.It("Acquiring K8s config and context", func() { + kubectlOptions, err = kubectlOptionsForCurrentContext() + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + kubectlOptions.Namespace = koperatorLocalHelmDescriptor.Namespace + }) + + // Pre-cleanup: forcibly remove any leftover cluster from a previous interrupted run + // so the install step always starts from a clean slate. + // We cannot rely on requireDeleteKafkaCluster here because a stuck finalizer (left by a + // mid-reconcile interruption) prevents the cascade deletion from completing in time. + // Instead we strip the finalizer first, delete the CR, then explicitly delete every + // owned resource so nothing blocks the subsequent fresh install. + ginkgo.It("Pre-cleanup: removing any leftover kafka-ts-resize cluster", func() { + // 1. Remove finalizers so the CR can be deleted regardless of operator state. + _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + "patch", kafkaKind, tsResizeClusterName, + "--type=merge", `--patch={"metadata":{"finalizers":[]}}`, + ) + // 2. Delete the CR itself (ignore not-found). + _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + "delete", kafkaKind, tsResizeClusterName, "--ignore-not-found", + ) + // 3. Explicitly delete PVCs — they carry a pvc-protection finalizer that + // blocks cascade GC while pods are still bound. + _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + "delete", "persistentvolumeclaims", + "-l", fmt.Sprintf("%s=%s", kafkaCRLabelKey, tsResizeClusterName), + "--ignore-not-found", + ) + // 4. Delete pods so they release PVC mounts promptly. + _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + "delete", "pods", + "-l", fmt.Sprintf("%s=%s", kafkaCRLabelKey, tsResizeClusterName), + "--ignore-not-found", "--grace-period=0", + ) + // 5. Wait for the CR itself to be gone (GC will handle the rest). + gomega.Eventually(context.Background(), func() error { + out, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + "get", kafkaKind, tsResizeClusterName, + ) + if err != nil || strings.Contains(out, "NotFound") || strings.Contains(out, "not found") { + return nil + } + return errors.New("KafkaCluster CR still exists") + }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) + }) + + ginkgo.It("Installing KafkaCluster with tiered storage cache PVC (2Gi)", func() { + ginkgo.By("Applying initial KafkaCluster manifest") + applyK8sResourceManifest(kubectlOptions, tsResizeInitialManifest) + + ginkgo.By("Waiting for all broker pods to be ready") + // kubectl wait --for=condition=Ready fails immediately when no pods exist yet. + // We don't wait for ClusterRunning — the cluster may stay in ClusterReconciling + // because GracefulDiskRebalanceRequired for log volumes needs CC, which this minimal + // test cluster does not deploy. Pod readiness is sufficient to proceed with the resize. + gomega.Eventually(context.Background(), func() error { + output, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + "get", "pod", + "-l", fmt.Sprintf("%s=%s,app=kafka,isBrokerNode=true", kafkaCRLabelKey, tsResizeClusterName), + "--output", "json", + ) + if err != nil { + return errors.WrapIf(err, "listing pods failed") + } + var podList struct { + Items []struct { + Status struct { + Conditions []struct { + Type string `json:"type"` + Status string `json:"status"` + } `json:"conditions"` + } `json:"status"` + } `json:"items"` + } + if err := json.Unmarshal([]byte(output), &podList); err != nil { + return errors.WrapIf(err, "parsing pod list JSON failed") + } + if len(podList.Items) < 1 { + return fmt.Errorf("expected at least 1 broker pod, got %d", len(podList.Items)) + } + for _, pod := range podList.Items { + ready := false + for _, cond := range pod.Status.Conditions { + if cond.Type == "Ready" && cond.Status == "True" { + ready = true + break + } + } + if !ready { + return errors.New("not all pods are ready yet") + } + } + return nil + }, kafkaClusterCreateTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) + + ginkgo.By("Capturing broker-0 pod UID before resize") + broker0PodUID, err = getBrokerPodUID(kubectlOptions, tsResizeClusterName, 0) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(broker0PodUID).NotTo(gomega.BeEmpty()) + + ginkgo.By("Verifying initial cache PVC size is " + tsResizeInitialSize) + pvcs, err := listBrokerCachePVCs(kubectlOptions, tsResizeClusterName, 0) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(pvcs).To(gomega.HaveLen(1), "expected exactly one cache PVC for broker 0") + gomega.Expect(pvcs[0].StorageSize).To(gomega.Equal(tsResizeInitialSize)) + }) + + ginkgo.It("Triggering cache PVC shrink from 2Gi to 1Gi", func() { + ginkgo.By("Applying shrunk KafkaCluster manifest") + applyK8sResourceManifest(kubectlOptions, tsResizeShrunkManifest) + }) + + ginkgo.It("Phase 1: old PVC annotated pending-deletion and replacement PVC created", func() { + ginkgo.By("Waiting until both pending-deletion and replacement PVCs coexist for broker 0") + gomega.Eventually(context.Background(), func() error { + pvcs, err := listBrokerCachePVCs(kubectlOptions, tsResizeClusterName, 0) + if err != nil { + return err + } + var hasPendingDeletion, hasReplacement bool + for _, pvc := range pvcs { + switch pvc.Annotations[pvcResizeStateAnnotation] { + case pvcResizeStatePending: + hasPendingDeletion = true + case pvcResizeStateReplacement: + hasReplacement = true + } + } + if !hasPendingDeletion { + return errors.New("no PVC with pending-deletion annotation yet") + } + if !hasReplacement { + return errors.New("no PVC with replacement annotation yet") + } + return nil + }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) + }) + + ginkgo.It("Phase 2: broker pod restarts, pending-deletion PVC is deleted", func() { + ginkgo.By("Waiting for broker-0 pod to be recycled (UID change indicates rolling restart)") + // We detect recycling by UID change rather than waiting for the pod count to hit zero. + // The pod may restart fast enough to be back before the next polling tick, causing a + // "pod count == 0" check to time out even when the restart already completed. + gomega.Eventually(context.Background(), func() error { + uid, err := getBrokerPodUID(kubectlOptions, tsResizeClusterName, 0) + if err != nil { + // Pod may be absent mid-restart; treat as not-yet-recycled. + return err + } + if uid == broker0PodUID { + return errors.New("broker-0 pod has not been recycled yet (same UID)") + } + return nil + }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) + + ginkgo.By("Waiting for the pending-deletion PVC to be deleted") + gomega.Eventually(context.Background(), func() error { + pvcs, err := listBrokerCachePVCs(kubectlOptions, tsResizeClusterName, 0) + if err != nil { + return err + } + for _, pvc := range pvcs { + if pvc.Annotations[pvcResizeStateAnnotation] == pvcResizeStatePending { + return fmt.Errorf("pending-deletion PVC %s still exists", pvc.Name) + } + } + return nil + }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) + }) + + ginkgo.It("Phase 3: broker pod running again with new PVC, replacement annotation stripped", func() { + ginkgo.By("Waiting for the broker-0 pod to come back Ready") + err = waitK8sResourceCondition(kubectlOptions, "pod", "condition=Ready", + tsResizePhaseTimeout, + fmt.Sprintf("%s=%s,brokerId=0,app=kafka", kafkaCRLabelKey, tsResizeClusterName), "") + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + ginkgo.By("Waiting for replacement annotation to be stripped from the surviving PVC") + gomega.Eventually(context.Background(), func() error { + pvcs, err := listBrokerCachePVCs(kubectlOptions, tsResizeClusterName, 0) + if err != nil { + return err + } + if len(pvcs) != 1 { + return fmt.Errorf("expected 1 cache PVC for broker 0, got %d", len(pvcs)) + } + if state := pvcs[0].Annotations[pvcResizeStateAnnotation]; state != "" { + return fmt.Errorf("cache-resize-state annotation still present: %q", state) + } + return nil + }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) + }) + + ginkgo.It("Verifying the surviving cache PVC has the new size "+tsResizeShrunkSize, func() { + pvcs, err := listBrokerCachePVCs(kubectlOptions, tsResizeClusterName, 0) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(pvcs).To(gomega.HaveLen(1)) + gomega.Expect(pvcs[0].StorageSize).To(gomega.Equal(tsResizeShrunkSize), + "surviving PVC should carry the shrunk size") + gomega.Expect(strings.Contains(pvcs[0].Name, tsResizeClusterName)).To(gomega.BeTrue(), + "surviving PVC should belong to the test cluster") + }) + + // requireDeleteKafkaCluster receives a copy of kubectlOptions at registration + // time, so Namespace must be set here (not inside the It block above). + kubectlOptions.Namespace = koperatorLocalHelmDescriptor.Namespace + // requireDeleteKafkaCluster registers its own It block — must be called at + // the When scope, not nested inside an It. + requireDeleteKafkaCluster(kubectlOptions, tsResizeClusterName) + }) +} From 20cfb3ab8ebe1ad56c11bfa9448625f1ee9dc696 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Wed, 1 Apr 2026 15:01:34 +0300 Subject: [PATCH 03/21] fixed test --- .../samples/kafkacluster_tiered_storage.yaml | 147 ------------------ ...r_tiered_storage_cache_resize_initial.yaml | 25 +-- ...er_tiered_storage_cache_resize_shrunk.yaml | 25 +-- tests/e2e/koperator_suite_test.go | 1 + 4 files changed, 3 insertions(+), 195 deletions(-) delete mode 100644 config/samples/kafkacluster_tiered_storage.yaml diff --git a/config/samples/kafkacluster_tiered_storage.yaml b/config/samples/kafkacluster_tiered_storage.yaml deleted file mode 100644 index e79c5b72b..000000000 --- a/config/samples/kafkacluster_tiered_storage.yaml +++ /dev/null @@ -1,147 +0,0 @@ -apiVersion: kafka.banzaicloud.io/v1beta1 -kind: KafkaCluster -metadata: - labels: - controller-tools.k8s.io: "1.0" - name: kafka-tiered-storage -spec: - kRaft: true - headlessServiceEnabled: true - oneBrokerPerNode: false - clusterImage: "ghcr.io/adobe/koperator/kafka:2.13-3.9.1" - - # Cluster-wide Kafka configuration including tiered storage settings - readOnlyConfig: | - auto.create.topics.enable=false - cruise.control.metrics.topic.auto.create=true - cruise.control.metrics.topic.num.partitions=1 - cruise.control.metrics.topic.replication.factor=2 - - # Tiered Storage Configuration - # Enable remote storage - remote.log.storage.system.enable=true - - # Remote storage manager class implementation - remote.log.storage.manager.class.name=org.apache.kafka.server.log.remote.storage.RemoteLogStorageManager - - # Remote log metadata manager class implementation - remote.log.metadata.manager.class.name=org.apache.kafka.server.log.remote.metadata.storage.TopicBasedRemoteLogMetadataManager - - # Remote storage manager configuration prefix - rsm.config.remote.storage.manager.impl.class=org.apache.kafka.server.log.remote.storage.RemoteLogStorageManager - - # Tiered storage cache configuration - this path matches the tieredStorageCache volume mount - rsm.config.fetch.chunk.cache.path=/tiered-storage-cache - - brokers: - - id: 0 - brokerConfig: - # Process roles for KRaft mode - processRoles: - - broker - - # Storage configurations - storageConfigs: - # Primary Kafka log storage - included in Cruise Control capacity - - mountPath: "/kafka-logs" - pvcSpec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 100Gi - - # Tiered storage cache - excluded from Cruise Control capacity and log.dirs - - mountPath: "/tiered-storage-cache" - tieredStorageCache: true - pvcSpec: - accessModes: - - ReadWriteOnce - # Can use a different storage class optimized for cache (e.g., faster SSD) - # storageClassName: fast-ssd - resources: - requests: - storage: 50Gi - - resourceRequirements: - limits: - cpu: "2" - memory: 4Gi - requests: - cpu: "1" - memory: 2Gi - - - id: 1 - brokerConfig: - processRoles: - - broker - - storageConfigs: - - mountPath: "/kafka-logs" - pvcSpec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 100Gi - - - mountPath: "/tiered-storage-cache" - tieredStorageCache: true - pvcSpec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 50Gi - - resourceRequirements: - limits: - cpu: "2" - memory: 4Gi - requests: - cpu: "1" - memory: 2Gi - - - id: 2 - brokerConfig: - processRoles: - - broker - - storageConfigs: - - mountPath: "/kafka-logs" - pvcSpec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 100Gi - - - mountPath: "/tiered-storage-cache" - tieredStorageCache: true - pvcSpec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 50Gi - - resourceRequirements: - limits: - cpu: "2" - memory: 4Gi - requests: - cpu: "1" - memory: 2Gi - - listenersConfig: - internalListeners: - - type: "plaintext" - name: "internal" - containerPort: 29092 - usedForInnerBrokerCommunication: true - - type: "plaintext" - name: "controller" - containerPort: 29093 - usedForInnerBrokerCommunication: false - usedForControllerCommunication: true - diff --git a/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml b/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml index d6cae7a43..b4df5bbd3 100644 --- a/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml +++ b/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml @@ -12,10 +12,7 @@ spec: auto.create.topics.enable=false rollingUpgradeConfig: failureThreshold: 1 - cruiseControlConfig: - topicConfig: - partitions: 1 - replicationFactor: 2 + cruiseControlConfig: {} brokers: - id: 0 brokerConfig: @@ -65,26 +62,6 @@ spec: requests: cpu: "100m" memory: 512Mi - - id: 2 - brokerConfig: - processRoles: - - broker - terminationGracePeriodSeconds: 10 - storageConfigs: - - mountPath: "/kafka-logs" - pvcSpec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 2Gi - resourceRequirements: - limits: - cpu: "500m" - memory: 1Gi - requests: - cpu: "100m" - memory: 512Mi listenersConfig: internalListeners: - type: "plaintext" diff --git a/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml b/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml index 88a7fc27d..5d158905a 100644 --- a/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml +++ b/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml @@ -12,10 +12,7 @@ spec: auto.create.topics.enable=false rollingUpgradeConfig: failureThreshold: 1 - cruiseControlConfig: - topicConfig: - partitions: 1 - replicationFactor: 2 + cruiseControlConfig: {} brokers: - id: 0 brokerConfig: @@ -65,26 +62,6 @@ spec: requests: cpu: "100m" memory: 512Mi - - id: 2 - brokerConfig: - processRoles: - - broker - terminationGracePeriodSeconds: 10 - storageConfigs: - - mountPath: "/kafka-logs" - pvcSpec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 2Gi - resourceRequirements: - limits: - cpu: "500m" - memory: 1Gi - requests: - cpu: "100m" - memory: 512Mi listenersConfig: internalListeners: - type: "plaintext" diff --git a/tests/e2e/koperator_suite_test.go b/tests/e2e/koperator_suite_test.go index 3e2c35c3a..64c461e9c 100644 --- a/tests/e2e/koperator_suite_test.go +++ b/tests/e2e/koperator_suite_test.go @@ -78,6 +78,7 @@ var _ = ginkgo.When("Testing e2e test altogether", ginkgo.Ordered, func() { testProduceConsumeInternal() testJmxExporter() testUninstallKafkaCluster() + testTieredStorageCachePvcResize() testUninstall() snapshotClusterAndCompare(snapshottedInfo) }) From 6a9d4c213472f4b4b9256aecb98b947ee0014bfb Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Wed, 1 Apr 2026 16:22:00 +0300 Subject: [PATCH 04/21] fixed docs --- .../kafka.banzaicloud.io_kafkaclusters.yaml | 2 +- docs/tiered-storage-pvc-resize.md | 301 +++--------------- 2 files changed, 45 insertions(+), 258 deletions(-) diff --git a/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml b/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml index 901a688c8..4e09f60f5 100644 --- a/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml +++ b/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml @@ -5101,7 +5101,7 @@ spec: the PersistentVolume backing this claim. type: string type: object - tieredStorageCache: + tieredStorageCache: description: |- TieredStorageCache indicates this storage is used for Kafka tiered storage cache (e.g., for rsm.config.fetch.chunk.cache.path). When set to true, this storage will be diff --git a/docs/tiered-storage-pvc-resize.md b/docs/tiered-storage-pvc-resize.md index 7336360f4..820f1e9a8 100644 --- a/docs/tiered-storage-pvc-resize.md +++ b/docs/tiered-storage-pvc-resize.md @@ -1,230 +1,55 @@ -# Tiered Storage Cache PVC Resize: Reconciliation Flow +# Tiered Storage Cache PVC Resize -This document explains how koperator handles resizing (shrinking) a tiered storage -cache PVC, why the flow is more involved than a regular PVC update, and the -design decisions behind the current implementation. +Kubernetes does not support shrinking a PVC in-place. Because tiered storage cache +data is ephemeral (repopulated from remote storage on broker restart), koperator +implements a **delete-and-recreate** strategy for cache PVC shrinks, coordinated +with the rolling upgrade machinery so only one broker is affected at a time. --- -## Background: why cache PVCs are different +## Annotations -Kubernetes does not allow shrinking a PVC — the only supported in-place change is -growing it. For regular Kafka log volumes koperator enforces the same constraint -(`isDesiredStorageValueInvalid` returns an error on any size decrease). - -Tiered storage cache volumes are different: the data they hold is ephemeral. When a -broker restarts it will repopulate the cache from remote storage. Losing the cache -does not cause data loss; it only causes a temporary increase in read latency. -Because of this koperator implements a **delete-and-recreate** strategy for cache -PVC shrinks — but it must do so safely, coordinating with the rolling upgrade -machinery so that only one broker is down at a time and cluster health gates are -respected. - ---- - -## Components involved - -| Component | File | Role | -|-----------|------|------| -| `reconcileKafkaPvc` | `pkg/resources/kafka/kafka.go` | Creates / updates / deletes PVCs | -| `reconcileKafkaPod` | `pkg/resources/kafka/kafka.go` | Creates pods or hands off to rolling upgrade | -| `handleRollingUpgrade` | `pkg/resources/kafka/kafka.go` | Deletes a pod after checking health gates | -| `getCreatedPvcForBroker` | `pkg/resources/kafka/kafka.go` | Looks up which PVCs exist for a broker and feeds them into the pod spec | -| `generateDataVolumeAndVolumeMount` | `pkg/resources/kafka/pod.go` | Translates PVC list → pod Volume + VolumeMount entries using `ClaimName` | - ---- - -## PVC annotations used during resize - -Two annotations are written directly on PVC objects to carry state through the -resize. Because annotations live on the Kubernetes object they survive reconciler -restarts, making every step re-entrant. +Two annotations are written on PVC objects to carry state across reconcile cycles. +They survive reconciler restarts, making every step re-entrant. | Annotation | Value | Written on | Meaning | |------------|-------|------------|---------| -| `koperator.adobe.com/cache-resize-state` | `pending-deletion` | Old PVC | This PVC is being replaced; skip it in pod spec generation and delete it once the broker pod stops | -| `koperator.adobe.com/cache-resize-state` | `replacement` | New PVC | This is the replacement PVC; defer to the rolling upgrade before treating it as normal | +| `koperator.adobe.com/cache-resize-state` | `pending-deletion` | Old PVC | Being replaced; excluded from pod spec; deleted once broker pod stops | +| `koperator.adobe.com/cache-resize-state` | `replacement` | New PVC | Replacement PVC; rolling upgrade must complete before annotations are stripped | | `koperator.adobe.com/replaces-pvc` | `` | New PVC | Traceability — records which PVC is being replaced | --- -## Reconciliation loop order - -Within a single `Reconcile()` call the kafka sub-reconciler runs in this order: - -``` -1. Services -2. PodDisruptionBudgets -3. reconcileKafkaPodDelete ← graceful downscale via Cruise Control -4. Listener statuses / PKI -5. reconcileKafkaPvc ← ALL PVC work happens here, before any pod work -6. Discover running pods + bound PVCs -7. reorderBrokers ← priority sort -8. FOR each broker: - a. ConfigMap - b. per-broker Service - c. reconcileKafkaPod ← pod create OR handleRollingUpgrade - d. reconcilePerBrokerDynamicConfig -9. reconcileClusterWideDynamicConfig -``` - -PVC reconciliation always runs before pod reconciliation. The new PVC is therefore -created (and starts provisioning) before `handleRollingUpgrade` is evaluated for -the same broker in the same cycle. - ---- - -## Full flow: shrinking a tiered storage cache PVC - -### Cycle N — resize detected (pod running) - -**`reconcileKafkaPvc` — per-broker setup** - -``` -r.List(brokerPodList) -brokerPodExists = true - -No pending-deletion PVCs exist yet → no cleanup -No replacement PVCs exist yet → no resize-complete strip -effectivePvcCount == len(desiredPvcs) → no CC disk removal triggered -``` - -**`reconcileKafkaPvc` — per-desired-PVC loop** +## Resize flow -``` -CheckIfObjectUpdated: currentSize > desiredSize → enters resize branch - - 1. Annotate old PVC: - koperator.adobe.com/cache-resize-state: pending-deletion - (r.Update — durable immediately in etcd) +### Cycle N — resize detected, pod running - 2. Create replacement PVC with desiredSize: - koperator.adobe.com/cache-resize-state: replacement - koperator.adobe.com/replaces-pvc: - (r.Create — provisioning starts now, in parallel with gate evaluation) +1. The old PVC is annotated `pending-deletion`. +2. A replacement PVC with the new (smaller) size is created and annotated `replacement`. Provisioning starts immediately. +3. The broker's `ConfigurationState` is set to `ConfigOutOfSync` to trigger a rolling restart via `handleRollingUpgrade`. +4. `handleRollingUpgrade` evaluates health gates (replica health, concurrent restart limit, rack awareness). If all pass the broker pod is deleted and the cycle requeues. If any gate fails the state is preserved in PVC annotations and retried next cycle. - 3. Set broker ConfigurationState = ConfigOutOfSync - (triggers handleRollingUpgrade on every cycle until pod restarts) +### Cycle N+1 — pod is absent - continue -``` +A pod is considered absent when it either does not exist or has a non-nil `DeletionTimestamp` (Terminating). Treating a Terminating pod as absent allows cleanup to start during the pod's Terminating window rather than waiting for it to fully disappear from etcd. -**`reconcileKafkaPod` — same cycle** +1. The pending-deletion PVC is deleted. +2. A new broker pod is created referencing the replacement PVC. Because provisioning started in cycle N the PVC is likely already `Bound`, minimising startup latency. -`handleRollingUpgrade` sees `ConfigOutOfSync`, evaluates gates: +### Cycle N+2 — pod is present again -| Gate | Required to pass | -|------|-----------------| -| Pod count | All expected pods exist | -| Concurrent restart limit | Terminating/Pending pods < `ConcurrentBrokerRestartCountPerRack` | -| Rack awareness | Only pods from same AZ as restarting pods | -| Replica health | `offline + out-of-sync < FailureThreshold` | +The strip fires as soon as a non-Terminating pod exists for the broker and no pending-deletion PVC remains — the pod does not need to be fully Running. -If all pass → **broker pod is deleted** → requeue 15 s. -If any fail → requeue 15 s, try again next cycle (state is fully durable via PVC annotations). +1. No pending-deletion PVC remains and the replacement PVC exists → resize is complete. +2. The `cache-resize-state` and `replaces-pvc` annotations are stripped from the replacement PVC, which becomes an ordinary PVC from this point forward. --- -### Between cycles (rolling upgrade gates blocking) +## Grow vs shrink -Each reconcile cycle sees: +A cache PVC **grow** takes the normal Kubernetes in-place expansion path: the PVC spec is updated with the larger size and Kubernetes expands the volume without a pod restart (requires `allowVolumeExpansion: true` on the StorageClass). No annotations are written and no rolling restart is triggered. -``` -reconcileKafkaPvc: - brokerPodExists = true - No pending-deletion PVCs → no cleanup - Replacement PVC exists, no pending-deletion → resize-complete strip - … but pod is still up → strip does NOT fire (pod must be up AND no pending-deletion) - alreadyCreated loop: skips pending-deletion PVC, finds replacement PVC - replacement PVC guard: ensure ConfigOutOfSync, continue - -reconcileKafkaPod: - handleRollingUpgrade re-evaluates gates → requeue if blocked -``` - -State is preserved entirely in PVC annotations — no in-memory or status-field -bookkeeping required. Reconciler can crash and restart at any point. - ---- - -### Cycle N+1 — pod is gone - -**`reconcileKafkaPvc` — per-broker setup** - -``` -r.List(brokerPodList) -brokerPodExists = false - -Cleanup loop: finds PVC with pending-deletion annotation - → r.Delete(oldPvc) ← safe now, broker is not running - → r.List(pvcList) ← re-list; only replacement PVC remains -``` - -**`getCreatedPvcForBroker` (called later to build pod spec)** - -Filters out any PVC with `cache-resize-state: pending-deletion` before matching -mount paths. Returns the replacement PVC for the mount path. The pod spec is built -with `ClaimName` pointing to the new PVC. - -**`reconcileKafkaPod`** - -No pod exists → creates new pod referencing the replacement PVC. Kubernetes holds -the pod in `Pending` until the replacement PVC reaches `Bound` phase. - -Because provisioning started in cycle N (when the replacement PVC was created), -the PVC is likely already `Bound` by now, minimising pod startup latency. - ---- - -### Cycle N+2 — pod is running again - -**`reconcileKafkaPvc` — resize-complete strip** - -``` -brokerPodExists = true -No pending-deletion PVCs (deleted in N+1) -Replacement PVC exists → resize complete → strip annotations: - delete koperator.adobe.com/cache-resize-state - delete koperator.adobe.com/replaces-pvc - r.Update(replacementPvc) -``` - -The PVC is now an ordinary PVC. Subsequent reconciles treat it normally. - ---- - -## Sequence diagram - -``` -Cycle N (pod UP, resize detected) - reconcileKafkaPvc - ├─ r.Update(oldPvc) annotate pending-deletion - ├─ r.Create(newPvc) replacement PVC, provisioning starts - └─ ConfigOutOfSync set - reconcileKafkaPod - └─ handleRollingUpgrade - ├─ [gates fail] → requeue 15s - └─ [gates pass] → delete pod → requeue 15s - -Cycle N+k (pod UP, gates failing — any number of cycles) - reconcileKafkaPvc - └─ replacement PVC guard: ensure ConfigOutOfSync, continue - reconcileKafkaPod - └─ handleRollingUpgrade → requeue - -Cycle N+k+1 (pod GONE) - reconcileKafkaPvc - ├─ delete pending-deletion PVC - └─ re-list: only replacement PVC remains - reconcileKafkaPod - └─ no pod → create pod (ClaimName = replacement PVC) - └─ pod Pending until PVC bound (likely already bound) - -Cycle N+k+2 (pod RUNNING) - reconcileKafkaPvc - └─ no pending-deletion PVC + replacement PVC exists → strip annotations - └─ replacement PVC becomes ordinary PVC -``` +A cache PVC **shrink** uses the delete-and-recreate flow described above. Shrinking is only supported for tiered storage cache volumes — regular Kafka log volumes reject any size decrease. --- @@ -232,8 +57,7 @@ Cycle N+k+2 (pod RUNNING) | Property | Value | |----------|-------| -| State survives reconciler crash | Yes — PVC annotations are durable in etcd | -| Idempotent re-entry | Yes — each phase checks annotations and resumes correctly | +| State survives reconciler crash | Mostly — PVC annotations are durable in etcd; the one non-re-entrant window is between annotating the old PVC and creating the replacement, but `ConfigOutOfSync` set in that cycle persists in broker status so the rolling upgrade still proceeds | | Atomicity gap | Eliminated — new PVC is created before old is deleted | | Provisioning overlaps gate evaluation | Yes — new PVC created in cycle N, not N+1 | | Observable via kubectl | Yes — `kubectl get pvc -o yaml` shows resize state directly | @@ -242,61 +66,24 @@ Cycle N+k+2 (pod RUNNING) --- -## Interaction with disk removal detection - -`reconcileKafkaPvc` detects regular disk removal by comparing PVC counts: +## Sequence diagram -```go -if effectivePvcCount > len(desiredPvcs) { handleDiskRemoval(...) } ``` +Cycle N (pod UP, resize detected) + ├─ annotate old PVC: pending-deletion + ├─ create replacement PVC (provisioning starts) + ├─ set ConfigOutOfSync + └─ handleRollingUpgrade + ├─ [gates fail] → requeue 15s, retry next cycle + └─ [gates pass] → delete pod → requeue 15s -The count uses `effectivePvcCount` which **excludes replacement PVCs**. During a -resize the old (pending-deletion) + new (replacement) PVCs temporarily co-exist for -the same mount path. Without this exclusion the count check would incorrectly -trigger a Cruise Control disk-removal operation. - -Inside `handleDiskRemoval` itself, the `pending-deletion` PVC is also safe: its -mount path still matches the desired spec, so `foundInDesired = true` and it is -skipped by the disk-removal state machine. - ---- - -## Interaction with GracefulDiskRebalanceRequired - -When a new PVC first becomes `Bound`, koperator normally sets -`GracefulDiskRebalanceRequired` to ask Cruise Control to rebalance data across -the broker's disks. This is correct for Kafka log volumes but wrong for tiered -storage cache volumes — CC must not account for ephemeral cache storage. - -The `alreadyCreated` loop now explicitly skips `GracefulDiskRebalanceRequired` for -any PVC annotated `tieredStorageCache: true`, including the replacement PVC. - ---- - -## Known limitations - -### ConfigOutOfSync still shared with config changes - -`ConfigOutOfSync` is set to trigger the rolling upgrade. It is the same bit used -for Kafka property changes. An observer cannot distinguish "resize pending" from -"config change" by status alone — the PVC annotations must be inspected. - -### Concurrent resize + complete storage-config removal - -If the tiered storage cache storage config is removed entirely from the spec while -a resize is in progress, both the `pending-deletion` and `replacement` PVCs have a -mount path that is no longer present in the desired spec. Without special handling -this would route them into the Cruise Control `remove_disks` path — which fails with -"log dir not found" because CC has no knowledge of ephemeral cache paths — leaving -the operator stuck in a 20-second requeue loop. - -koperator handles this as follows: +Cycle N+k (pod UP, gates failing — any number of cycles) + └─ ensure ConfigOutOfSync, requeue -**Pod DOWN**: both the `pending-deletion` and the orphaned `replacement` PVC are -deleted directly in `reconcileKafkaPvc` (bypassing Cruise Control). The operator -proceeds normally on the next cycle. +Cycle N+k+1 (pod ABSENT — gone or Terminating) + ├─ delete pending-deletion PVC + └─ create new pod bound to replacement PVC -**Pod UP**: koperator sets `ConfigOutOfSync` to trigger a rolling restart via -`handleRollingUpgrade`, then skips `handleDiskRemoval` for this broker entirely -(`continue` in the broker loop). Once the pod stops, the next cycle falls into the -Pod DOWN path above and cleans up both PVCs. +Cycle N+k+2 (pod PRESENT — non-Terminating, not necessarily Running) + └─ strip annotations → replacement PVC becomes ordinary PVC +``` From fbb97e4b8a0c86e88e2b4cff834ee79c2dd8220a Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Thu, 2 Apr 2026 11:43:32 +0300 Subject: [PATCH 05/21] lint errors --- pkg/resources/kafka/kafka.go | 557 ++++++++++-------- pkg/resources/kafka/kafka_test.go | 19 +- pkg/resources/kafka/pvc.go | 2 +- tests/e2e/test_tiered_storage_cache_resize.go | 22 +- 4 files changed, 333 insertions(+), 267 deletions(-) diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index 243ededf5..178ae5fc2 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -75,6 +75,9 @@ const ( // pvcCacheResizeReplacesPVC records the name of the old PVC that the replacement supersedes. pvcCacheResizeReplacesPVC = "koperator.adobe.com/replaces-pvc" + // annotationTrue is the string value used for boolean-true annotations and config comparisons. + annotationTrue = "true" + brokerConfigMapVolumeMount = "broker-config" kafkaDataVolumeMount = "kafka-data" @@ -1175,21 +1178,16 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro for brokerId, desiredPvcs := range brokersDesiredPvcs { desiredType := reflect.TypeOf(&corev1.PersistentVolumeClaim{}) brokerVolumesState := make(map[string]banzaiv1beta1.VolumeState) - pvcList := &corev1.PersistentVolumeClaimList{} - matchingLabels := client.MatchingLabels( apiutil.MergeLabels( apiutil.LabelsForKafka(r.KafkaCluster.Name), map[string]string{banzaiv1beta1.BrokerIdLabelKey: brokerId}, ), ) - log = log.WithValues("kind", desiredType) - err := r.List(ctx, pvcList, - client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels) - if err != nil { + if err := r.List(ctx, pvcList, client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels); err != nil { return errorfactory.New(errorfactory.APIFailure{}, err, "getting resource failed", "kind", desiredType) } @@ -1197,7 +1195,6 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro if err != nil { return errors.WrapIfWithDetails(err, "could not determine if broker is controller", "brokerId", brokerId) } - if isController { if len(desiredPvcs) != 1 { return errors.New("controller broker can have only one volume") @@ -1215,113 +1212,22 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro // Treating a Terminating pod as "not existing" lets the pending-deletion PVC be cleaned up // during the Terminating window rather than waiting for the narrow gap between the pod being // fully removed from etcd and the new pod being created in the same reconcile cycle. - brokerPodList := &corev1.PodList{} - brokerPodMatchingLabels := client.MatchingLabels( - apiutil.MergeLabels( - apiutil.LabelsForKafka(r.KafkaCluster.Name), - map[string]string{banzaiv1beta1.BrokerIdLabelKey: brokerId}, - ), - ) - if err := r.List(ctx, brokerPodList, client.InNamespace(r.KafkaCluster.GetNamespace()), brokerPodMatchingLabels); err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, "getting broker pods failed", "brokerId", brokerId) - } - brokerPodExists := false - for i := range brokerPodList.Items { - if brokerPodList.Items[i].DeletionTimestamp == nil { - brokerPodExists = true - break - } + brokerPodExists, err := r.getBrokerPodExists(ctx, brokerId) + if err != nil { + return err } - // Build a set of desired mount paths for orphaned-PVC detection below. desiredMountPaths := make(map[string]bool, len(desiredPvcs)) for _, dp := range desiredPvcs { desiredMountPaths[dp.Annotations["mountPath"]] = true } - if brokerPodExists { - // Resize complete when the replacement PVC exists but the pending-deletion PVC is already gone: - // strip the replacement annotation so the PVC is treated as normal going forward. - var hasPendingDeletion bool - for _, pvc := range pvcList.Items { - if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { - hasPendingDeletion = true - break - } - } - if !hasPendingDeletion { - for _, pvc := range pvcList.Items { - if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement { - pvcCopy := pvc.DeepCopy() - delete(pvcCopy.Annotations, pvcCacheResizeStateAnnotation) - delete(pvcCopy.Annotations, pvcCacheResizeReplacesPVC) - if err := r.Update(ctx, pvcCopy); err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, - "removing replacement annotation from tiered storage cache PVC failed", "pvc", pvc.Name) - } - log.Info("Tiered storage cache PVC resize complete, removed replacement annotation", "pvc", pvc.Name) - } - } - } - } else { - // Broker pod is down — safe to delete any PVCs that were waiting for the pod to stop. - for i := range pvcList.Items { - pvc := &pvcList.Items[i] - if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { - log.Info("Broker pod is down — deleting pending-deletion tiered storage cache PVC", - "brokerId", brokerId, "pvc", pvc.Name) - if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { - return errorfactory.New(errorfactory.APIFailure{}, err, - "deleting pending-deletion tiered storage cache PVC failed", "pvc", pvc.Name) - } - } - } - // Also delete replacement PVCs whose mount path is no longer desired — these are - // orphaned by a storage config removal that happened while a cache resize was in flight. - for i := range pvcList.Items { - pvc := &pvcList.Items[i] - if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement && - !desiredMountPaths[pvc.Annotations["mountPath"]] { - log.Info("Broker pod is down — deleting orphaned replacement tiered storage cache PVC", - "brokerId", brokerId, "pvc", pvc.Name) - if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { - return errorfactory.New(errorfactory.APIFailure{}, err, - "deleting orphaned replacement tiered storage cache PVC failed", "pvc", pvc.Name) - } - } - } - // Re-list so the rest of this iteration sees the current state. - if err := r.List(ctx, pvcList, client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels); err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, "getting resource failed", "kind", desiredType) - } + skipBroker, err := r.handleBrokerCacheResizeCleanup(ctx, log, brokerId, pvcList, desiredMountPaths, brokerPodExists, matchingLabels) + if err != nil { + return err } - - // If the pod is still running and there are orphaned cache resize PVCs (storage config - // removed while a resize was in flight), trigger a rolling restart so the pod stops - // and we can delete the orphaned PVCs on the next cycle. Routing these through - // Cruise Control remove_disks would fail because cache paths are not Kafka log dirs. - if brokerPodExists { - var hasOrphanedCachePvc bool - for _, pvc := range pvcList.Items { - resizeState := pvc.Annotations[pvcCacheResizeStateAnnotation] - if (resizeState == pvcCacheResizePendingDeletion || resizeState == pvcCacheResizeReplacement) && - !desiredMountPaths[pvc.Annotations["mountPath"]] { - hasOrphanedCachePvc = true - break - } - } - if hasOrphanedCachePvc { - log.Info("Orphaned cache resize PVCs detected with broker pod running — triggering rolling restart for safe cleanup", - "brokerId", brokerId) - if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { - if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, - banzaiv1beta1.ConfigOutOfSync, log); err != nil { - return errorfactory.New(errorfactory.StatusUpdateError{}, err, - "could not mark broker ConfigOutOfSync for orphaned cache PVC cleanup", "brokerId", brokerId) - } - } - continue // Skip disk removal — wait for pod to stop before deleting cache PVCs. - } + if skipBroker { + continue } // Handle disk removal — count only non-replacement PVCs to avoid false positives while a @@ -1339,176 +1245,333 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro } } - for _, desiredPvc := range desiredPvcs { - currentPvc := desiredPvc.DeepCopy() - log.V(1).Info("searching with label because name is empty") + if err := r.reconcileDesiredPvcsForBroker(ctx, log, brokerId, desiredPvcs, pvcList, matchingLabels, desiredType, brokerVolumesState); err != nil { + return err + } - err := r.List(ctx, pvcList, - client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels) - if err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, "getting resource failed", "kind", desiredType) - } + if len(brokerVolumesState) > 0 { + brokerIds = append(brokerIds, brokerId) + brokersVolumesState[brokerId] = brokerVolumesState + } + } - mountPath := currentPvc.Annotations[mountPathAnnotationKey] - // Creating the first PersistentVolume For Pod - if len(pvcList.Items) == 0 { - if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { - return errors.WrapIf(err, "could not apply last state to annotation") - } - if err := r.Create(ctx, desiredPvc); err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, "creating resource failed", "kind", desiredType) - } - log.Info("resource created") - continue - } + if len(brokersVolumesState) > 0 { + if err := k8sutil.UpdateBrokerStatus(r.Client, brokerIds, r.KafkaCluster, brokersVolumesState, log); err != nil { + return err + } + } + + if waitForDiskRemovalToFinish { + return errorfactory.New(errorfactory.CruiseControlTaskRunning{}, errors.New("Disk removal pending"), "Disk removal pending") + } + + return nil +} - alreadyCreated := false +// getBrokerPodExists returns true if a non-terminating pod exists for the given broker. +func (r *Reconciler) getBrokerPodExists(ctx context.Context, brokerId string) (bool, error) { + brokerPodList := &corev1.PodList{} + matchingLabels := client.MatchingLabels( + apiutil.MergeLabels( + apiutil.LabelsForKafka(r.KafkaCluster.Name), + map[string]string{banzaiv1beta1.BrokerIdLabelKey: brokerId}, + ), + ) + if err := r.List(ctx, brokerPodList, client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels); err != nil { + return false, errorfactory.New(errorfactory.APIFailure{}, err, "getting broker pods failed", "brokerId", brokerId) + } + for i := range brokerPodList.Items { + if brokerPodList.Items[i].DeletionTimestamp == nil { + return true, nil + } + } + return false, nil +} + +// handleBrokerCacheResizeCleanup manages tiered storage cache resize PVC cleanup based on broker pod state. +// It returns true when the caller should skip the rest of the reconcile loop for this broker (orphaned PVCs +// detected while the pod is running — a rolling restart must complete first). +// +//nolint:funlen +func (r *Reconciler) handleBrokerCacheResizeCleanup( + ctx context.Context, + log logr.Logger, + brokerId string, + pvcList *corev1.PersistentVolumeClaimList, + desiredMountPaths map[string]bool, + brokerPodExists bool, + matchingLabels client.MatchingLabels, +) (bool, error) { + desiredType := reflect.TypeOf(&corev1.PersistentVolumeClaim{}) + if brokerPodExists { + // Resize complete when the replacement PVC exists but the pending-deletion PVC is already gone: + // strip the replacement annotation so the PVC is treated as normal going forward. + var hasPendingDeletion bool + for _, pvc := range pvcList.Items { + if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { + hasPendingDeletion = true + break + } + } + if !hasPendingDeletion { for _, pvc := range pvcList.Items { - if mountPath != pvc.Annotations[mountPathAnnotationKey] { - continue + if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement { + pvcCopy := pvc.DeepCopy() + delete(pvcCopy.Annotations, pvcCacheResizeStateAnnotation) + delete(pvcCopy.Annotations, pvcCacheResizeReplacesPVC) + if err := r.Update(ctx, pvcCopy); err != nil { + return false, errorfactory.New(errorfactory.APIFailure{}, err, + "removing replacement annotation from tiered storage cache PVC failed", "pvc", pvc.Name) + } + log.Info("Tiered storage cache PVC resize complete, removed replacement annotation", "pvc", pvc.Name) } - // Skip the old PVC that is waiting to be deleted once the broker pod stops. - // The replacement PVC (same mount path) will be matched on the next iteration. - if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { - continue + } + } + } else { + // Broker pod is down — safe to delete any PVCs that were waiting for the pod to stop. + for i := range pvcList.Items { + pvc := &pvcList.Items[i] + if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { + log.Info("Broker pod is down — deleting pending-deletion tiered storage cache PVC", + "brokerId", brokerId, "pvc", pvc.Name) + if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { + return false, errorfactory.New(errorfactory.APIFailure{}, err, + "deleting pending-deletion tiered storage cache PVC failed", "pvc", pvc.Name) } - currentPvc = pvc.DeepCopy() - alreadyCreated = true - // Trigger a CC disk rebalance only for regular data volumes. - // Tiered storage cache PVCs are ephemeral — CC must not account for them. - isCachePvc := pvc.Annotations["tieredStorageCache"] == "true" - if !isCachePvc { - // Checking pvc state, if bounded, so the broker has already restarted and the CC GracefulDiskRebalance has not happened yet, - // then we make it happening with status update. - // If disk removal was set, and the disk was added back, we also need to mark the volume for rebalance - volumeState, found := r.KafkaCluster.Status.BrokersState[brokerId].GracefulActionState.VolumeStates[mountPath] - if currentPvc.Status.Phase == corev1.ClaimBound && - (!found || volumeState.CruiseControlVolumeState.IsDiskRemoval()) { - brokerVolumesState[mountPath] = banzaiv1beta1.VolumeState{CruiseControlVolumeState: banzaiv1beta1.GracefulDiskRebalanceRequired} - } + } + } + // Also delete replacement PVCs whose mount path is no longer desired — these are + // orphaned by a storage config removal that happened while a cache resize was in flight. + for i := range pvcList.Items { + pvc := &pvcList.Items[i] + if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement && + !desiredMountPaths[pvc.Annotations["mountPath"]] { + log.Info("Broker pod is down — deleting orphaned replacement tiered storage cache PVC", + "brokerId", brokerId, "pvc", pvc.Name) + if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { + return false, errorfactory.New(errorfactory.APIFailure{}, err, + "deleting orphaned replacement tiered storage cache PVC failed", "pvc", pvc.Name) } - break } + } + // Re-list so the rest of this iteration sees the current state. + if err := r.List(ctx, pvcList, client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels); err != nil { + return false, errorfactory.New(errorfactory.APIFailure{}, err, "getting resource failed", "kind", desiredType) + } + } - if !alreadyCreated { - // Creating the 2+ PersistentVolumes for Pod - if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { - return errors.WrapIf(err, "could not apply last state to annotation") - } - if err := r.Create(ctx, desiredPvc); err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, "creating resource failed", "kind", desiredType) + // If the pod is still running and there are orphaned cache resize PVCs (storage config + // removed while a resize was in flight), trigger a rolling restart so the pod stops + // and we can delete the orphaned PVCs on the next cycle. Routing these through + // Cruise Control remove_disks would fail because cache paths are not Kafka log dirs. + if brokerPodExists { + var hasOrphanedCachePvc bool + for _, pvc := range pvcList.Items { + resizeState := pvc.Annotations[pvcCacheResizeStateAnnotation] + if (resizeState == pvcCacheResizePendingDeletion || resizeState == pvcCacheResizeReplacement) && + !desiredMountPaths[pvc.Annotations["mountPath"]] { + hasOrphanedCachePvc = true + break + } + } + if hasOrphanedCachePvc { + log.Info("Orphaned cache resize PVCs detected with broker pod running — triggering rolling restart for safe cleanup", + "brokerId", brokerId) + if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + banzaiv1beta1.ConfigOutOfSync, log); err != nil { + return false, errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not mark broker ConfigOutOfSync for orphaned cache PVC cleanup", "brokerId", brokerId) } + } + return true, nil // Skip disk removal — wait for pod to stop before deleting cache PVCs. + } + } + + return false, nil +} + +// reconcileDesiredPvcsForBroker reconciles the desired PVCs for a single broker. +// +//nolint:funlen +func (r *Reconciler) reconcileDesiredPvcsForBroker( + ctx context.Context, + log logr.Logger, + brokerId string, + desiredPvcs []*corev1.PersistentVolumeClaim, + pvcList *corev1.PersistentVolumeClaimList, + matchingLabels client.MatchingLabels, + desiredType reflect.Type, + brokerVolumesState map[string]banzaiv1beta1.VolumeState, +) error { + for _, desiredPvc := range desiredPvcs { + currentPvc := desiredPvc.DeepCopy() + log.V(1).Info("searching with label because name is empty") + + if err := r.List(ctx, pvcList, client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, "getting resource failed", "kind", desiredType) + } + + mountPath := currentPvc.Annotations[mountPathAnnotationKey] + // Creating the first PersistentVolume For Pod + if len(pvcList.Items) == 0 { + if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { + return errors.WrapIf(err, "could not apply last state to annotation") + } + if err := r.Create(ctx, desiredPvc); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, "creating resource failed", "kind", desiredType) + } + log.Info("resource created") + continue + } + + alreadyCreated := false + for _, pvc := range pvcList.Items { + if mountPath != pvc.Annotations[mountPathAnnotationKey] { continue } - if err == nil { - // A replacement PVC is the new smaller PVC staged during a tiered storage cache resize. - // It already has the right size; we only need to keep the rolling upgrade running so - // the broker pod stops and the pending-deletion PVC can be cleaned up. - if currentPvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement { - if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { - if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, - banzaiv1beta1.ConfigOutOfSync, log); err != nil { - return errorfactory.New(errorfactory.StatusUpdateError{}, err, - "could not mark broker ConfigOutOfSync for pending tiered storage cache PVC resize", "brokerId", brokerId) - } - } - continue + // Skip the old PVC that is waiting to be deleted once the broker pod stops. + // The replacement PVC (same mount path) will be matched on the next iteration. + if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { + continue + } + currentPvc = pvc.DeepCopy() + alreadyCreated = true + // Trigger a CC disk rebalance only for regular data volumes. + // Tiered storage cache PVCs are ephemeral — CC must not account for them. + if pvc.Annotations["tieredStorageCache"] != annotationTrue { + // Checking pvc state, if bounded, so the broker has already restarted and the CC GracefulDiskRebalance has not happened yet, + // then we make it happening with status update. + // If disk removal was set, and the disk was added back, we also need to mark the volume for rebalance + volumeState, found := r.KafkaCluster.Status.BrokersState[brokerId].GracefulActionState.VolumeStates[mountPath] + if currentPvc.Status.Phase == corev1.ClaimBound && + (!found || volumeState.CruiseControlVolumeState.IsDiskRemoval()) { + brokerVolumesState[mountPath] = banzaiv1beta1.VolumeState{CruiseControlVolumeState: banzaiv1beta1.GracefulDiskRebalanceRequired} } + } + break + } - if k8sutil.CheckIfObjectUpdated(log, desiredType, currentPvc, desiredPvc) { - if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { - return errors.WrapIf(err, "could not apply last state to annotation") - } + if !alreadyCreated { + // Creating the 2+ PersistentVolumes for Pod + if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { + return errors.WrapIf(err, "could not apply last state to annotation") + } + if err := r.Create(ctx, desiredPvc); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, "creating resource failed", "kind", desiredType) + } + continue + } - // Check if this is a tiered storage cache volume - isTieredCache := currentPvc.Annotations["tieredStorageCache"] == "true" - desiredSize := desiredPvc.Spec.Resources.Requests.Storage().Value() - currentSize := currentPvc.Spec.Resources.Requests.Storage().Value() - - // Tiered storage cache PVC shrink: stage the replacement PVC immediately so - // provisioning runs in parallel with rolling-upgrade gate evaluation. - // The old PVC is annotated pending-deletion and removed once the broker pod stops. - if isTieredCache && desiredSize < currentSize { - log.Info("Tiered storage cache size decrease detected — staging replacement PVC", - "brokerId", brokerId, - "mountPath", mountPath, - "currentSize", currentSize, - "desiredSize", desiredSize) - - // Annotate old PVC so it is skipped by getCreatedPvcForBroker and cleaned up - // the moment the broker pod stops. - oldPvcCopy := currentPvc.DeepCopy() - oldPvcCopy.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizePendingDeletion - if err := r.Update(ctx, oldPvcCopy); err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, - "annotating old tiered storage cache PVC for deletion failed", "pvc", currentPvc.Name) - } - log.Info("Marked old tiered storage cache PVC for deletion", "brokerId", brokerId, "pvc", currentPvc.Name) + // A replacement PVC is the new smaller PVC staged during a tiered storage cache resize. + // It already has the right size; we only need to keep the rolling upgrade running so + // the broker pod stops and the pending-deletion PVC can be cleaned up. + if currentPvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement { + if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + banzaiv1beta1.ConfigOutOfSync, log); err != nil { + return errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not mark broker ConfigOutOfSync for pending tiered storage cache PVC resize", "brokerId", brokerId) + } + } + continue + } - // Create the replacement PVC immediately so provisioning starts now. - desiredPvc.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizeReplacement - desiredPvc.Annotations[pvcCacheResizeReplacesPVC] = currentPvc.Name - if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { - return errors.WrapIf(err, "could not apply last state to annotation") - } - if err := r.Create(ctx, desiredPvc); err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, - "creating replacement tiered storage cache PVC failed", "brokerId", brokerId) - } - log.Info("Created replacement tiered storage cache PVC", - "brokerId", brokerId, "pvc", desiredPvc.Name, "desiredSize", desiredSize) - - // Trigger rolling upgrade so the broker pod is restarted and the old PVC - // can be deleted on the next reconcile cycle. - if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { - log.Info("Marking broker ConfigOutOfSync to trigger rolling upgrade for tiered storage cache PVC resize", - "brokerId", brokerId) - if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, - banzaiv1beta1.ConfigOutOfSync, log); err != nil { - return errorfactory.New(errorfactory.StatusUpdateError{}, err, - "could not mark broker ConfigOutOfSync for tiered storage cache PVC resize", "brokerId", brokerId) - } - } - continue - } + if !k8sutil.CheckIfObjectUpdated(log, desiredType, currentPvc, desiredPvc) { + continue + } - // Regular validation: size decreases are forbidden for non-cache volumes. - if isDesiredStorageValueInvalid(desiredPvc, currentPvc) { - return errorfactory.New(errorfactory.InternalError{}, errors.New("could not modify pvc size"), - "one can not reduce the size of a PVC", "kind", desiredType) - } + if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { + return errors.WrapIf(err, "could not apply last state to annotation") + } - resReq := desiredPvc.Spec.Resources.Requests - labels := desiredPvc.Labels - desiredPvc = currentPvc.DeepCopy() - desiredPvc.Spec.Resources.Requests = resReq - desiredPvc.Labels = labels + // Check if this is a tiered storage cache volume + isTieredCache := currentPvc.Annotations["tieredStorageCache"] == annotationTrue + desiredSize := desiredPvc.Spec.Resources.Requests.Storage().Value() + currentSize := currentPvc.Spec.Resources.Requests.Storage().Value() - if err := r.Update(ctx, desiredPvc); err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, "updating resource failed", "kind", desiredType) - } - log.Info("resource updated") - } + // Tiered storage cache PVC shrink: stage the replacement PVC immediately so + // provisioning runs in parallel with rolling-upgrade gate evaluation. + // The old PVC is annotated pending-deletion and removed once the broker pod stops. + if isTieredCache && desiredSize < currentSize { + if err := r.stageTieredCachePVCShrink(ctx, log, brokerId, mountPath, currentPvc, desiredPvc, currentSize, desiredSize); err != nil { + return err } + continue } - if len(brokerVolumesState) > 0 { - brokerIds = append(brokerIds, brokerId) - brokersVolumesState[brokerId] = brokerVolumesState + // Regular validation: size decreases are forbidden for non-cache volumes. + if isDesiredStorageValueInvalid(desiredPvc, currentPvc) { + return errorfactory.New(errorfactory.InternalError{}, errors.New("could not modify pvc size"), + "one can not reduce the size of a PVC", "kind", desiredType) } - } - if len(brokersVolumesState) > 0 { - err := k8sutil.UpdateBrokerStatus(r.Client, brokerIds, r.KafkaCluster, brokersVolumesState, log) - if err != nil { - return err + resReq := desiredPvc.Spec.Resources.Requests + labels := desiredPvc.Labels + desiredPvc = currentPvc.DeepCopy() + desiredPvc.Spec.Resources.Requests = resReq + desiredPvc.Labels = labels + + if err := r.Update(ctx, desiredPvc); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, "updating resource failed", "kind", desiredType) } + log.Info("resource updated") } + return nil +} - if waitForDiskRemovalToFinish { - return errorfactory.New(errorfactory.CruiseControlTaskRunning{}, errors.New("Disk removal pending"), "Disk removal pending") +// stageTieredCachePVCShrink handles shrinking a tiered storage cache PVC by staging a replacement +// and triggering a rolling restart so the old PVC can be deleted once the broker pod stops. +func (r *Reconciler) stageTieredCachePVCShrink( + ctx context.Context, + log logr.Logger, + brokerId string, + mountPath string, + currentPvc *corev1.PersistentVolumeClaim, + desiredPvc *corev1.PersistentVolumeClaim, + currentSize int64, + desiredSize int64, +) error { + log.Info("Tiered storage cache size decrease detected — staging replacement PVC", + "brokerId", brokerId, + "mountPath", mountPath, + "currentSize", currentSize, + "desiredSize", desiredSize) + + // Annotate old PVC so it is skipped by getCreatedPvcForBroker and cleaned up + // the moment the broker pod stops. + oldPvcCopy := currentPvc.DeepCopy() + oldPvcCopy.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizePendingDeletion + if err := r.Update(ctx, oldPvcCopy); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, + "annotating old tiered storage cache PVC for deletion failed", "pvc", currentPvc.Name) + } + log.Info("Marked old tiered storage cache PVC for deletion", "brokerId", brokerId, "pvc", currentPvc.Name) + + // Create the replacement PVC immediately so provisioning starts now. + desiredPvc.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizeReplacement + desiredPvc.Annotations[pvcCacheResizeReplacesPVC] = currentPvc.Name + if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { + return errors.WrapIf(err, "could not apply last state to annotation") + } + if err := r.Create(ctx, desiredPvc); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, + "creating replacement tiered storage cache PVC failed", "brokerId", brokerId) } + log.Info("Created replacement tiered storage cache PVC", + "brokerId", brokerId, "pvc", desiredPvc.Name, "desiredSize", desiredSize) + // Trigger rolling upgrade so the broker pod is restarted and the old PVC + // can be deleted on the next reconcile cycle. + if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { + log.Info("Marking broker ConfigOutOfSync to trigger rolling upgrade for tiered storage cache PVC resize", + "brokerId", brokerId) + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + banzaiv1beta1.ConfigOutOfSync, log); err != nil { + return errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not mark broker ConfigOutOfSync for tiered storage cache PVC resize", "brokerId", brokerId) + } + } return nil } diff --git a/pkg/resources/kafka/kafka_test.go b/pkg/resources/kafka/kafka_test.go index 33093680a..64082f443 100644 --- a/pkg/resources/kafka/kafka_test.go +++ b/pkg/resources/kafka/kafka_test.go @@ -31,11 +31,12 @@ import ( logf "sigs.k8s.io/controller-runtime/pkg/log" corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "github.com/banzaicloud/k8s-objectmatcher/patch" + "github.com/banzaicloud/koperator/api/v1alpha1" "github.com/banzaicloud/koperator/api/v1beta1" controllerMocks "github.com/banzaicloud/koperator/controllers/tests/mocks" @@ -1402,15 +1403,15 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { } testCases := []struct { - testName string - existingPvc *corev1.PersistentVolumeClaim - desiredPvc *corev1.PersistentVolumeClaim - existingPods []corev1.Pod + testName string + existingPvc *corev1.PersistentVolumeClaim + desiredPvc *corev1.PersistentVolumeClaim + existingPods []corev1.Pod // expectedUpdatePvc: old PVC annotated pending-deletion, or annotation stripped on resize-complete - expectedUpdatePvc bool - expectedCreatePvc bool - expectedDeletePvc bool - expectedError bool + expectedUpdatePvc bool + expectedCreatePvc bool + expectedDeletePvc bool + expectedError bool }{ { // Pod is up: annotate old PVC as pending-deletion, create replacement PVC, diff --git a/pkg/resources/kafka/pvc.go b/pkg/resources/kafka/pvc.go index f18e4febe..fcd80b101 100644 --- a/pkg/resources/kafka/pvc.go +++ b/pkg/resources/kafka/pvc.go @@ -66,7 +66,7 @@ func (r *Reconciler) pvc(brokerId int32, storageIndex int, storage v1beta1.Stora // Mark tiered storage cache PVCs with annotation for special handling if storage.TieredStorageCache { - annotations["tieredStorageCache"] = "true" + annotations["tieredStorageCache"] = annotationTrue } return &corev1.PersistentVolumeClaim{ diff --git a/tests/e2e/test_tiered_storage_cache_resize.go b/tests/e2e/test_tiered_storage_cache_resize.go index 0decedbc5..b45ffdb04 100644 --- a/tests/e2e/test_tiered_storage_cache_resize.go +++ b/tests/e2e/test_tiered_storage_cache_resize.go @@ -38,12 +38,14 @@ const ( tsResizeShrunkSize = "1Gi" // Annotation keys written by the cache-resize reconciler. - pvcResizeStateAnnotation = "koperator.adobe.com/cache-resize-state" - pvcResizeStatePending = "pending-deletion" + pvcResizeStateAnnotation = "koperator.adobe.com/cache-resize-state" + pvcResizeStatePending = "pending-deletion" pvcResizeStateReplacement = "replacement" tsResizePhaseTimeout = 10 * time.Minute tsResizePollingInterval = 15 * time.Second + + tsResizeBrokerID = 0 ) // pvcItem is a minimal representation of a PVC for assertion helpers. @@ -53,10 +55,10 @@ type pvcItem struct { StorageSize string } -// listBrokerCachePVCs returns PVCs for the given broker that have the +// listBrokerCachePVCs returns PVCs for broker tsResizeBrokerID that have the // tiered-storage-cache mount path annotation. -func listBrokerCachePVCs(kubectlOptions k8s.KubectlOptions, clusterName string, brokerID int) ([]pvcItem, error) { - selector := fmt.Sprintf("%s=%s,brokerId=%d", kafkaCRLabelKey, clusterName, brokerID) +func listBrokerCachePVCs(kubectlOptions k8s.KubectlOptions) ([]pvcItem, error) { + selector := fmt.Sprintf("%s=%s,brokerId=%d", kafkaCRLabelKey, tsResizeClusterName, tsResizeBrokerID) rawOutput, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, "get", "persistentvolumeclaims", @@ -252,7 +254,7 @@ func testTieredStorageCachePvcResize() bool { gomega.Expect(broker0PodUID).NotTo(gomega.BeEmpty()) ginkgo.By("Verifying initial cache PVC size is " + tsResizeInitialSize) - pvcs, err := listBrokerCachePVCs(kubectlOptions, tsResizeClusterName, 0) + pvcs, err := listBrokerCachePVCs(kubectlOptions) gomega.Expect(err).NotTo(gomega.HaveOccurred()) gomega.Expect(pvcs).To(gomega.HaveLen(1), "expected exactly one cache PVC for broker 0") gomega.Expect(pvcs[0].StorageSize).To(gomega.Equal(tsResizeInitialSize)) @@ -266,7 +268,7 @@ func testTieredStorageCachePvcResize() bool { ginkgo.It("Phase 1: old PVC annotated pending-deletion and replacement PVC created", func() { ginkgo.By("Waiting until both pending-deletion and replacement PVCs coexist for broker 0") gomega.Eventually(context.Background(), func() error { - pvcs, err := listBrokerCachePVCs(kubectlOptions, tsResizeClusterName, 0) + pvcs, err := listBrokerCachePVCs(kubectlOptions) if err != nil { return err } @@ -308,7 +310,7 @@ func testTieredStorageCachePvcResize() bool { ginkgo.By("Waiting for the pending-deletion PVC to be deleted") gomega.Eventually(context.Background(), func() error { - pvcs, err := listBrokerCachePVCs(kubectlOptions, tsResizeClusterName, 0) + pvcs, err := listBrokerCachePVCs(kubectlOptions) if err != nil { return err } @@ -330,7 +332,7 @@ func testTieredStorageCachePvcResize() bool { ginkgo.By("Waiting for replacement annotation to be stripped from the surviving PVC") gomega.Eventually(context.Background(), func() error { - pvcs, err := listBrokerCachePVCs(kubectlOptions, tsResizeClusterName, 0) + pvcs, err := listBrokerCachePVCs(kubectlOptions) if err != nil { return err } @@ -345,7 +347,7 @@ func testTieredStorageCachePvcResize() bool { }) ginkgo.It("Verifying the surviving cache PVC has the new size "+tsResizeShrunkSize, func() { - pvcs, err := listBrokerCachePVCs(kubectlOptions, tsResizeClusterName, 0) + pvcs, err := listBrokerCachePVCs(kubectlOptions) gomega.Expect(err).NotTo(gomega.HaveOccurred()) gomega.Expect(pvcs).To(gomega.HaveLen(1)) gomega.Expect(pvcs[0].StorageSize).To(gomega.Equal(tsResizeShrunkSize), From 7884065d7ba1b91c1db22ccb4449e2c8d2291c6c Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Thu, 2 Apr 2026 15:59:38 +0300 Subject: [PATCH 06/21] add annotation during reconcile --- pkg/resources/kafka/kafka.go | 47 +++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index 178ae5fc2..8c0633feb 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -1230,6 +1230,27 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro continue } + // Delete tiered storage cache PVCs whose mount path is no longer desired. + // These are not Kafka log dirs so they must never go through CC disk removal. + for i := range pvcList.Items { + pvc := &pvcList.Items[i] + if pvc.Annotations["tieredStorageCache"] != annotationTrue { + continue + } + if !desiredMountPaths[pvc.Annotations["mountPath"]] { + log.Info("Deleting removed tiered storage cache PVC", + "brokerId", brokerId, "mountPath", pvc.Annotations["mountPath"], "pvc", pvc.Name) + if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { + return errorfactory.New(errorfactory.APIFailure{}, err, + "deleting removed tiered storage cache PVC failed", "pvc", pvc.Name) + } + } + } + // Re-list so the disk removal count below reflects the deletions above. + if err := r.List(ctx, pvcList, client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, "getting resource failed", "kind", desiredType) + } + // Handle disk removal — count only non-replacement PVCs to avoid false positives while a // tiered storage cache resize is in flight (old + new PVC temporarily coexist). effectivePvcCount := 0 @@ -1437,9 +1458,27 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( } currentPvc = pvc.DeepCopy() alreadyCreated = true + + // Backfill the tieredStorageCache annotation if the desired PVC has it but the existing + // PVC does not — handles PVCs created before the annotation was introduced. + if desiredPvc.Annotations["tieredStorageCache"] == annotationTrue && + currentPvc.Annotations["tieredStorageCache"] != annotationTrue { + pvcCopy := currentPvc.DeepCopy() + if pvcCopy.Annotations == nil { + pvcCopy.Annotations = make(map[string]string) + } + pvcCopy.Annotations["tieredStorageCache"] = annotationTrue + if err := r.Update(ctx, pvcCopy); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, + "backfilling tieredStorageCache annotation on existing PVC failed", "pvc", pvcCopy.Name) + } + log.Info("Backfilled tieredStorageCache annotation on existing PVC", "pvc", pvcCopy.Name) + currentPvc = pvcCopy + } + // Trigger a CC disk rebalance only for regular data volumes. // Tiered storage cache PVCs are ephemeral — CC must not account for them. - if pvc.Annotations["tieredStorageCache"] != annotationTrue { + if currentPvc.Annotations["tieredStorageCache"] != annotationTrue { // Checking pvc state, if bounded, so the broker has already restarted and the CC GracefulDiskRebalance has not happened yet, // then we make it happening with status update. // If disk removal was set, and the disk was added back, we also need to mark the volume for rebalance @@ -1485,8 +1524,10 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( return errors.WrapIf(err, "could not apply last state to annotation") } - // Check if this is a tiered storage cache volume - isTieredCache := currentPvc.Annotations["tieredStorageCache"] == annotationTrue + // Check if this is a tiered storage cache volume. Fall back to the desired PVC annotation + // for PVCs created before the tieredStorageCache annotation was introduced. + isTieredCache := currentPvc.Annotations["tieredStorageCache"] == annotationTrue || + desiredPvc.Annotations["tieredStorageCache"] == annotationTrue desiredSize := desiredPvc.Spec.Resources.Requests.Storage().Value() currentSize := currentPvc.Spec.Resources.Requests.Storage().Value() From 7f72ee817019bd73f7bc31caa16f700c428f2102 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Tue, 7 Apr 2026 14:13:14 +0300 Subject: [PATCH 07/21] pr review --- api/v1beta1/common_types.go | 13 + charts/kafka-operator/crds/kafkaclusters.yaml | 10 + .../kafka.banzaicloud.io_kafkaclusters.yaml | 10 + docs/tiered-storage-pvc-resize.md | 86 +++-- pkg/k8sutil/status.go | 11 + pkg/resources/kafka/configmap.go | 4 + pkg/resources/kafka/kafka.go | 318 ++++++++++++------ pkg/resources/kafka/kafka_test.go | 119 +++---- tests/e2e/test_tiered_storage_cache_resize.go | 116 ++++--- 9 files changed, 461 insertions(+), 226 deletions(-) diff --git a/api/v1beta1/common_types.go b/api/v1beta1/common_types.go index df41632d0..c7466a66f 100644 --- a/api/v1beta1/common_types.go +++ b/api/v1beta1/common_types.go @@ -216,6 +216,16 @@ type VolumeState struct { CruiseControlOperationReference *corev1.LocalObjectReference `json:"cruiseControlOperationReference,omitempty"` } +// CacheResizeState tracks the resize lifecycle of a tiered storage cache PVC for a given mount path. +type CacheResizeState string + +const ( + // CacheResizePendingDeletion indicates that the old cache PVC at this mount path is waiting + // to be deleted once the broker pod stops. A replacement PVC with the new desired size has + // already been created at the same mount path. + CacheResizePendingDeletion CacheResizeState = "pending-deletion" +) + // BrokerState holds information about broker state type BrokerState struct { // RackAwarenessState holds info about rack awareness status @@ -234,6 +244,9 @@ type BrokerState struct { Image string `json:"image,omitempty"` // Compressed data from broker configuration to restore broker pod in specific cases ConfigurationBackup string `json:"configurationBackup,omitempty"` + // CacheVolumeStates tracks in-flight tiered storage cache PVC resize operations, keyed by mount path. + // An entry is present only while a resize is in progress; it is cleared once cleanup completes. + CacheVolumeStates map[string]CacheResizeState `json:"cacheVolumeStates,omitempty"` } const ( diff --git a/charts/kafka-operator/crds/kafkaclusters.yaml b/charts/kafka-operator/crds/kafkaclusters.yaml index 4e09f60f5..273ec3028 100644 --- a/charts/kafka-operator/crds/kafkaclusters.yaml +++ b/charts/kafka-operator/crds/kafkaclusters.yaml @@ -23092,6 +23092,16 @@ spec: additionalProperties: description: BrokerState holds information about broker state properties: + cacheVolumeStates: + additionalProperties: + description: CacheResizeState tracks the resize lifecycle of + a tiered storage cache PVC for a given mount path. + type: string + description: CacheVolumeStates tracks in-flight tiered storage + cache PVC resize operations, keyed by mount path. An entry + is present only while a resize is in progress; it is cleared + once cleanup completes. + type: object configurationBackup: description: Compressed data from broker configuration to restore broker pod in specific cases diff --git a/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml b/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml index 4e09f60f5..273ec3028 100644 --- a/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml +++ b/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml @@ -23092,6 +23092,16 @@ spec: additionalProperties: description: BrokerState holds information about broker state properties: + cacheVolumeStates: + additionalProperties: + description: CacheResizeState tracks the resize lifecycle of + a tiered storage cache PVC for a given mount path. + type: string + description: CacheVolumeStates tracks in-flight tiered storage + cache PVC resize operations, keyed by mount path. An entry + is present only while a resize is in progress; it is cleared + once cleanup completes. + type: object configurationBackup: description: Compressed data from broker configuration to restore broker pod in specific cases diff --git a/docs/tiered-storage-pvc-resize.md b/docs/tiered-storage-pvc-resize.md index 820f1e9a8..dd21d7a1d 100644 --- a/docs/tiered-storage-pvc-resize.md +++ b/docs/tiered-storage-pvc-resize.md @@ -7,16 +7,27 @@ with the rolling upgrade machinery so only one broker is affected at a time. --- -## Annotations +## State tracking -Two annotations are written on PVC objects to carry state across reconcile cycles. -They survive reconciler restarts, making every step re-entrant. +Resize state is stored in the `KafkaCluster` CR status under +`status.brokersState[].cacheVolumeStates`, keyed by mount path. +This keeps the KafkaCluster CR the single source of truth for all in-flight +broker operations and avoids a second, parallel state store on PVC objects. -| Annotation | Value | Written on | Meaning | -|------------|-------|------------|---------| -| `koperator.adobe.com/cache-resize-state` | `pending-deletion` | Old PVC | Being replaced; excluded from pod spec; deleted once broker pod stops | -| `koperator.adobe.com/cache-resize-state` | `replacement` | New PVC | Replacement PVC; rolling upgrade must complete before annotations are stripped | -| `koperator.adobe.com/replaces-pvc` | `` | New PVC | Traceability — records which PVC is being replaced | +| Field | Value | Meaning | +|-------|-------|---------| +| `status.brokersState[N].cacheVolumeStates[]` | `pending-deletion` | A resize is in flight for this mount path. The old PVC (larger size) is waiting to be deleted once the broker pod stops; the replacement PVC (desired smaller size) has already been created. | + +The entry is cleared once the old PVC has been deleted and the broker pod has +restarted. An empty map means no resize is in progress. + +Two PVC annotations that describe what a PVC **is** (not operational state) are +always present on cache PVCs: + +| Annotation | Value | Purpose | +|------------|-------|---------| +| `mountPath` | `` | Used throughout reconcile logic to match PVCs to storage configs | +| `tieredStorageCache` | `"true"` | Identifies cache PVCs for special handling: skipped from `log.dirs` and CC capacity config | --- @@ -24,32 +35,45 @@ They survive reconciler restarts, making every step re-entrant. ### Cycle N — resize detected, pod running -1. The old PVC is annotated `pending-deletion`. -2. A replacement PVC with the new (smaller) size is created and annotated `replacement`. Provisioning starts immediately. -3. The broker's `ConfigurationState` is set to `ConfigOutOfSync` to trigger a rolling restart via `handleRollingUpgrade`. -4. `handleRollingUpgrade` evaluates health gates (replica health, concurrent restart limit, rack awareness). If all pass the broker pod is deleted and the cycle requeues. If any gate fails the state is preserved in PVC annotations and retried next cycle. +1. `status.brokersState[N].cacheVolumeStates[]` is set to `pending-deletion` + in the KafkaCluster CR status. This is the durable record that a resize is in flight. +2. A replacement PVC with the new (smaller) size is created. Provisioning starts immediately. +3. The broker's `ConfigurationState` is set to `ConfigOutOfSync` to trigger a rolling restart + via `handleRollingUpgrade`. +4. `handleRollingUpgrade` evaluates health gates (replica health, concurrent restart limit, + rack awareness). If all pass the broker pod is deleted and the cycle requeues. If any gate + fails the state persists in the CR and is retried next cycle. ### Cycle N+1 — pod is absent -A pod is considered absent when it either does not exist or has a non-nil `DeletionTimestamp` (Terminating). Treating a Terminating pod as absent allows cleanup to start during the pod's Terminating window rather than waiting for it to fully disappear from etcd. +A pod is considered absent when it either does not exist or has a non-nil +`DeletionTimestamp` (Terminating). Treating a Terminating pod as absent allows +cleanup to start during the pod's Terminating window rather than waiting for it +to fully disappear from etcd. -1. The pending-deletion PVC is deleted. -2. A new broker pod is created referencing the replacement PVC. Because provisioning started in cycle N the PVC is likely already `Bound`, minimising startup latency. +1. The old PVC (the one whose size differs from the desired size at that mount path) + is deleted. +2. The `cacheVolumeStates` entry for that mount path is cleared from the CR status. +3. A new broker pod is created referencing the replacement PVC. Because provisioning + started in cycle N the PVC is likely already `Bound`, minimising startup latency. ### Cycle N+2 — pod is present again -The strip fires as soon as a non-Terminating pod exists for the broker and no pending-deletion PVC remains — the pod does not need to be fully Running. - -1. No pending-deletion PVC remains and the replacement PVC exists → resize is complete. -2. The `cache-resize-state` and `replaces-pvc` annotations are stripped from the replacement PVC, which becomes an ordinary PVC from this point forward. +1. No `cacheVolumeStates` entry remains for the mount path → resize is complete. +2. The replacement PVC is now an ordinary cache PVC with no special state attached. --- ## Grow vs shrink -A cache PVC **grow** takes the normal Kubernetes in-place expansion path: the PVC spec is updated with the larger size and Kubernetes expands the volume without a pod restart (requires `allowVolumeExpansion: true` on the StorageClass). No annotations are written and no rolling restart is triggered. +A cache PVC **grow** takes the normal Kubernetes in-place expansion path: the PVC +spec is updated with the larger size and Kubernetes expands the volume without a +pod restart (requires `allowVolumeExpansion: true` on the StorageClass). No +`cacheVolumeStates` entry is written and no rolling restart is triggered. -A cache PVC **shrink** uses the delete-and-recreate flow described above. Shrinking is only supported for tiered storage cache volumes — regular Kafka log volumes reject any size decrease. +A cache PVC **shrink** uses the delete-and-recreate flow described above. +Shrinking is only supported for tiered storage cache volumes — regular Kafka log +volumes reject any size decrease with an error. --- @@ -57,12 +81,13 @@ A cache PVC **shrink** uses the delete-and-recreate flow described above. Shrink | Property | Value | |----------|-------| -| State survives reconciler crash | Mostly — PVC annotations are durable in etcd; the one non-re-entrant window is between annotating the old PVC and creating the replacement, but `ConfigOutOfSync` set in that cycle persists in broker status so the rolling upgrade still proceeds | -| Atomicity gap | Eliminated — new PVC is created before old is deleted | -| Provisioning overlaps gate evaluation | Yes — new PVC created in cycle N, not N+1 | -| Observable via kubectl | Yes — `kubectl get pvc -o yaml` shows resize state directly | -| ConfigOutOfSync overloading | Reduced — `ConfigOutOfSync` still used, but the *reason* is legible in PVC annotations | -| CC disk rebalance for cache PVCs | Fixed — tiered cache PVCs are explicitly excluded from `GracefulDiskRebalanceRequired` logic | +| State survives reconciler crash | Yes — `cacheVolumeStates` is written to the KafkaCluster CR (etcd) before the replacement PVC is created; every step is re-entrant | +| Single source of truth | Yes — all broker state (configuration, graceful actions, cache resize) lives in `status.brokersState` | +| Atomicity gap | Eliminated — replacement PVC is created before old is deleted | +| Provisioning overlaps gate evaluation | Yes — replacement PVC created in cycle N, not N+1 | +| Observable via kubectl | Yes — `kubectl get kafkacluster -o jsonpath='{.status.brokersState}'` shows resize state; an empty `cacheVolumeStates` means no resize is in progress | +| CC disk rebalance for cache PVCs | Excluded — tiered cache PVCs are explicitly skipped from `GracefulDiskRebalanceRequired` and CC capacity config | +| `log.dirs` for cache PVCs | Excluded — `generateStorageConfig` skips volumes with `TieredStorageCache: true` | --- @@ -70,7 +95,7 @@ A cache PVC **shrink** uses the delete-and-recreate flow described above. Shrink ``` Cycle N (pod UP, resize detected) - ├─ annotate old PVC: pending-deletion + ├─ set cacheVolumeStates[mountPath] = pending-deletion in CR status ├─ create replacement PVC (provisioning starts) ├─ set ConfigOutOfSync └─ handleRollingUpgrade @@ -81,9 +106,10 @@ Cycle N+k (pod UP, gates failing — any number of cycles) └─ ensure ConfigOutOfSync, requeue Cycle N+k+1 (pod ABSENT — gone or Terminating) - ├─ delete pending-deletion PVC + ├─ delete old PVC (identified as the PVC at mountPath whose size ≠ desired) + ├─ clear cacheVolumeStates[mountPath] from CR status └─ create new pod bound to replacement PVC Cycle N+k+2 (pod PRESENT — non-Terminating, not necessarily Running) - └─ strip annotations → replacement PVC becomes ordinary PVC + └─ cacheVolumeStates entry is absent → resize complete, no further action ``` diff --git a/pkg/k8sutil/status.go b/pkg/k8sutil/status.go index 4836cb3ab..e48a305ae 100644 --- a/pkg/k8sutil/status.go +++ b/pkg/k8sutil/status.go @@ -174,6 +174,17 @@ func generateBrokerState(brokerIDs []string, cluster *banzaicloudv1beta1.KafkaCl case banzaicloudv1beta1.KafkaVersion: brokerState.Image = s.Image brokerState.Version = s.Version + case map[string]banzaicloudv1beta1.CacheResizeState: + if brokerState.CacheVolumeStates == nil { + brokerState.CacheVolumeStates = make(map[string]banzaicloudv1beta1.CacheResizeState) + } + for mountPath, state := range s { + if state == "" { + delete(brokerState.CacheVolumeStates, mountPath) + } else { + brokerState.CacheVolumeStates[mountPath] = state + } + } } brokersState[brokerID] = brokerState } diff --git a/pkg/resources/kafka/configmap.go b/pkg/resources/kafka/configmap.go index 9fc9d8b09..9ac0b32a7 100644 --- a/pkg/resources/kafka/configmap.go +++ b/pkg/resources/kafka/configmap.go @@ -411,6 +411,10 @@ func getMountPathsFromBrokerConfigMap(configMap *corev1.ConfigMap) ([]string, er func generateStorageConfig(sConfig []v1beta1.StorageConfig) []string { mountPaths := make([]string, 0, len(sConfig)) for _, storage := range sConfig { + // Tiered storage cache volumes are not Kafka log dirs — exclude them from log.dirs. + if storage.TieredStorageCache { + continue + } mountPaths = append(mountPaths, util.StorageConfigKafkaMountPath(storage.MountPath)) } return mountPaths diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index 8c0633feb..5b9684393 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -67,14 +67,6 @@ const ( brokerConfigTemplate = "%s-config" brokerStorageTemplate = "%s-%d-storage-%d-" - // pvcCacheResizeStateAnnotation is set on PVCs during a tiered storage cache resize to track - // which PVC is the old one awaiting deletion and which is the new replacement. - pvcCacheResizeStateAnnotation = "koperator.adobe.com/cache-resize-state" - pvcCacheResizePendingDeletion = "pending-deletion" - pvcCacheResizeReplacement = "replacement" - // pvcCacheResizeReplacesPVC records the name of the old PVC that the replacement supersedes. - pvcCacheResizeReplacesPVC = "koperator.adobe.com/replaces-pvc" - // annotationTrue is the string value used for boolean-true annotations and config comparisons. annotationTrue = "true" @@ -156,7 +148,9 @@ func getCreatedPvcForBroker( c client.Reader, brokerID int32, storageConfigs []banzaiv1beta1.StorageConfig, - namespace, crName string) ([]corev1.PersistentVolumeClaim, error) { + namespace, crName string, + pendingDeletionMountPaths map[string]bool, +) ([]corev1.PersistentVolumeClaim, error) { foundPvcList := &corev1.PersistentVolumeClaimList{} matchingLabels := client.MatchingLabels( apiutil.MergeLabels( @@ -169,14 +163,27 @@ func getCreatedPvcForBroker( return nil, err } - // Exclude PVCs that are pending deletion during a tiered storage cache resize. - // The replacement PVC (same mount path, smaller size) is already present and will be used instead. + // Filter the PVC list: + // 1. Always exclude terminating PVCs (DeletionTimestamp set) — they are released from pods + // and should not be mounted by the new broker pod. + // 2. For mount paths with an in-flight cache resize (pendingDeletionMountPaths), two PVCs + // temporarily coexist (old + replacement). Keep only one so the pod spec has a unique + // mount path (the reconciler will use the correctly-sized one during PVC reconciliation). + seenMountPaths := make(map[string]bool) n := 0 for _, pvc := range foundPvcList.Items { - if pvc.Annotations[pvcCacheResizeStateAnnotation] != pvcCacheResizePendingDeletion { - foundPvcList.Items[n] = pvc - n++ + if pvc.DeletionTimestamp != nil { + continue + } + mp := pvc.Annotations["mountPath"] + if pendingDeletionMountPaths[mp] { + if seenMountPaths[mp] { + continue + } + seenMountPaths[mp] = true } + foundPvcList.Items[n] = pvc + n++ } foundPvcList.Items = foundPvcList.Items[:n] @@ -465,7 +472,14 @@ func (r *Reconciler) Reconcile(log logr.Logger) error { } } - pvcs, err := getCreatedPvcForBroker(ctx, r.Client, broker.Id, brokerConfig.StorageConfigs, r.KafkaCluster.Namespace, r.KafkaCluster.Name) + pendingDeletionMountPaths := make(map[string]bool) + brokerIdStr := strconv.Itoa(int(broker.Id)) + for mp, state := range r.KafkaCluster.Status.BrokersState[brokerIdStr].CacheVolumeStates { + if state == banzaiv1beta1.CacheResizePendingDeletion { + pendingDeletionMountPaths[mp] = true + } + } + pvcs, err := getCreatedPvcForBroker(ctx, r.Client, broker.Id, brokerConfig.StorageConfigs, r.KafkaCluster.Namespace, r.KafkaCluster.Name, pendingDeletionMountPaths) if err != nil { return errors.WrapIfWithDetails(err, "failed to list PVC's") } @@ -1222,7 +1236,7 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro desiredMountPaths[dp.Annotations["mountPath"]] = true } - skipBroker, err := r.handleBrokerCacheResizeCleanup(ctx, log, brokerId, pvcList, desiredMountPaths, brokerPodExists, matchingLabels) + skipBroker, err := r.handleBrokerCacheResizeCleanup(ctx, log, brokerId, pvcList, desiredPvcs, desiredMountPaths, brokerPodExists, matchingLabels) if err != nil { return err } @@ -1234,7 +1248,7 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro // These are not Kafka log dirs so they must never go through CC disk removal. for i := range pvcList.Items { pvc := &pvcList.Items[i] - if pvc.Annotations["tieredStorageCache"] != annotationTrue { + if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { continue } if !desiredMountPaths[pvc.Annotations["mountPath"]] { @@ -1246,16 +1260,66 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro } } } + + // Delete duplicate non-terminating cache PVCs at desired mount paths when no resize is in + // flight. These are orphaned replacements from a double-staging race (two reconcile cycles + // both staged a replacement before the first deletion propagated). Keep only the Bound one; + // delete any Pending duplicates. Safe only when the broker pod is down. + if !brokerPodExists { + cacheVolumeStates := r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates + boundByMountPath := make(map[string]bool) + for _, pvc := range pvcList.Items { + if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { + continue + } + mp := pvc.Annotations["mountPath"] + if !desiredMountPaths[mp] || cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { + continue + } + if pvc.Status.Phase == corev1.ClaimBound { + boundByMountPath[mp] = true + } + } + for i := range pvcList.Items { + pvc := &pvcList.Items[i] + if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { + continue + } + mp := pvc.Annotations["mountPath"] + if !desiredMountPaths[mp] || cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { + continue + } + // Delete a Pending duplicate only when a Bound PVC already exists at this path. + if pvc.Status.Phase == corev1.ClaimPending && boundByMountPath[mp] { + log.Info("Deleting orphaned Pending cache PVC (duplicate at same mount path)", + "brokerId", brokerId, "mountPath", mp, "pvc", pvc.Name) + if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { + return errorfactory.New(errorfactory.APIFailure{}, err, + "deleting orphaned Pending cache PVC failed", "pvc", pvc.Name) + } + } + } + } + // Re-list so the disk removal count below reflects the deletions above. if err := r.List(ctx, pvcList, client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels); err != nil { return errorfactory.New(errorfactory.APIFailure{}, err, "getting resource failed", "kind", desiredType) } - // Handle disk removal — count only non-replacement PVCs to avoid false positives while a - // tiered storage cache resize is in flight (old + new PVC temporarily coexist). + // Handle disk removal — for mount paths with an in-flight cache resize two PVCs temporarily + // coexist (old pending-deletion + new replacement). Count each such path only once to avoid + // a false-positive disk-removal trigger. + cacheVolumeStates := r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates + countedCacheMountPaths := make(map[string]bool) effectivePvcCount := 0 for _, pvc := range pvcList.Items { - if pvc.Annotations[pvcCacheResizeStateAnnotation] != pvcCacheResizeReplacement { + mp := pvc.Annotations["mountPath"] + if cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { + if !countedCacheMountPaths[mp] { + countedCacheMountPaths[mp] = true + effectivePvcCount++ + } + } else { effectivePvcCount++ } } @@ -1310,8 +1374,8 @@ func (r *Reconciler) getBrokerPodExists(ctx context.Context, brokerId string) (b } // handleBrokerCacheResizeCleanup manages tiered storage cache resize PVC cleanup based on broker pod state. -// It returns true when the caller should skip the rest of the reconcile loop for this broker (orphaned PVCs -// detected while the pod is running — a rolling restart must complete first). +// It returns true when the caller should skip the rest of the reconcile loop for this broker (orphaned resize +// state detected while the pod is running — a rolling restart must complete first). // //nolint:funlen func (r *Reconciler) handleBrokerCacheResizeCleanup( @@ -1319,60 +1383,109 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( log logr.Logger, brokerId string, pvcList *corev1.PersistentVolumeClaimList, + desiredPvcs []*corev1.PersistentVolumeClaim, desiredMountPaths map[string]bool, brokerPodExists bool, matchingLabels client.MatchingLabels, ) (bool, error) { + cacheVolumeStates := r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates + if len(cacheVolumeStates) == 0 { + return false, nil + } + desiredType := reflect.TypeOf(&corev1.PersistentVolumeClaim{}) + + // Build desired size per mount path for identifying the old PVC during cleanup. + desiredSizeByMountPath := make(map[string]int64, len(desiredPvcs)) + for _, dp := range desiredPvcs { + if s := dp.Spec.Resources.Requests.Storage(); s != nil { + desiredSizeByMountPath[dp.Annotations["mountPath"]] = s.Value() + } + } + + // stateUpdates accumulates CacheVolumeStates changes to apply in a single status update. + // An empty CacheResizeState value means "delete this entry from the map". + stateUpdates := make(map[string]banzaiv1beta1.CacheResizeState) + if brokerPodExists { - // Resize complete when the replacement PVC exists but the pending-deletion PVC is already gone: - // strip the replacement annotation so the PVC is treated as normal going forward. - var hasPendingDeletion bool - for _, pvc := range pvcList.Items { - if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { - hasPendingDeletion = true - break + // Resize complete when the replacement PVC (size == desired) already exists at the mount + // path and no old PVC (size != desired) remains. Using a size-based check instead of a + // count-based one makes the logic crash-safe: if the reconciler crashes between writing + // pending-deletion state and creating the replacement PVC, the next cycle sees only the + // old PVC (size != desired) and correctly does NOT clear state, letting + // reconcileDesiredPvcsForBroker re-create the replacement. + for mp, state := range cacheVolumeStates { + if state != banzaiv1beta1.CacheResizePendingDeletion { + continue } - } - if !hasPendingDeletion { + desiredSize := desiredSizeByMountPath[mp] + replacementExists := false + oldPvcExists := false for _, pvc := range pvcList.Items { - if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement { - pvcCopy := pvc.DeepCopy() - delete(pvcCopy.Annotations, pvcCacheResizeStateAnnotation) - delete(pvcCopy.Annotations, pvcCacheResizeReplacesPVC) - if err := r.Update(ctx, pvcCopy); err != nil { - return false, errorfactory.New(errorfactory.APIFailure{}, err, - "removing replacement annotation from tiered storage cache PVC failed", "pvc", pvc.Name) + if pvc.Annotations["mountPath"] != mp { + continue + } + if pvcSize := pvc.Spec.Resources.Requests.Storage(); pvcSize != nil { + if pvcSize.Value() == desiredSize { + replacementExists = true + } else { + oldPvcExists = true } - log.Info("Tiered storage cache PVC resize complete, removed replacement annotation", "pvc", pvc.Name) } } + if replacementExists && !oldPvcExists { + stateUpdates[mp] = "" + log.Info("Tiered storage cache PVC resize complete — clearing resize state", + "brokerId", brokerId, "mountPath", mp) + } } } else { - // Broker pod is down — safe to delete any PVCs that were waiting for the pod to stop. - for i := range pvcList.Items { - pvc := &pvcList.Items[i] - if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { - log.Info("Broker pod is down — deleting pending-deletion tiered storage cache PVC", - "brokerId", brokerId, "pvc", pvc.Name) - if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { - return false, errorfactory.New(errorfactory.APIFailure{}, err, - "deleting pending-deletion tiered storage cache PVC failed", "pvc", pvc.Name) + // Broker pod is down — safe to delete old PVCs and clean up orphaned state. + for mp, state := range cacheVolumeStates { + if state != banzaiv1beta1.CacheResizePendingDeletion { + continue + } + if !desiredMountPaths[mp] { + // Mount path is no longer desired: delete all PVCs at this path (old + replacement). + for i := range pvcList.Items { + pvc := &pvcList.Items[i] + if pvc.Annotations["mountPath"] != mp { + continue + } + log.Info("Broker pod is down — deleting orphaned cache resize PVC", + "brokerId", brokerId, "mountPath", mp, "pvc", pvc.Name) + if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { + return false, errorfactory.New(errorfactory.APIFailure{}, err, + "deleting orphaned cache resize PVC failed", "pvc", pvc.Name) + } + } + } else { + // Mount path still desired: delete the old PVC (the one whose size differs from desired). + desiredSize := desiredSizeByMountPath[mp] + for i := range pvcList.Items { + pvc := &pvcList.Items[i] + if pvc.Annotations["mountPath"] != mp { + continue + } + if pvc.Spec.Resources.Requests.Storage().Value() != desiredSize { + log.Info("Broker pod is down — deleting pending-deletion tiered storage cache PVC", + "brokerId", brokerId, "mountPath", mp, "pvc", pvc.Name) + if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { + return false, errorfactory.New(errorfactory.APIFailure{}, err, + "deleting pending-deletion tiered storage cache PVC failed", "pvc", pvc.Name) + } + break + } } } + stateUpdates[mp] = "" } - // Also delete replacement PVCs whose mount path is no longer desired — these are - // orphaned by a storage config removal that happened while a cache resize was in flight. - for i := range pvcList.Items { - pvc := &pvcList.Items[i] - if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement && - !desiredMountPaths[pvc.Annotations["mountPath"]] { - log.Info("Broker pod is down — deleting orphaned replacement tiered storage cache PVC", - "brokerId", brokerId, "pvc", pvc.Name) - if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { - return false, errorfactory.New(errorfactory.APIFailure{}, err, - "deleting orphaned replacement tiered storage cache PVC failed", "pvc", pvc.Name) - } + + if len(stateUpdates) > 0 { + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + stateUpdates, log); err != nil { + return false, errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not clear cache resize state after PVC cleanup", "brokerId", brokerId) } } // Re-list so the rest of this iteration sees the current state. @@ -1381,31 +1494,31 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( } } - // If the pod is still running and there are orphaned cache resize PVCs (storage config - // removed while a resize was in flight), trigger a rolling restart so the pod stops - // and we can delete the orphaned PVCs on the next cycle. Routing these through - // Cruise Control remove_disks would fail because cache paths are not Kafka log dirs. - if brokerPodExists { - var hasOrphanedCachePvc bool - for _, pvc := range pvcList.Items { - resizeState := pvc.Annotations[pvcCacheResizeStateAnnotation] - if (resizeState == pvcCacheResizePendingDeletion || resizeState == pvcCacheResizeReplacement) && - !desiredMountPaths[pvc.Annotations["mountPath"]] { - hasOrphanedCachePvc = true - break - } + if brokerPodExists && len(stateUpdates) > 0 { + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + stateUpdates, log); err != nil { + return false, errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not clear cache resize state after resize complete", "brokerId", brokerId) } - if hasOrphanedCachePvc { - log.Info("Orphaned cache resize PVCs detected with broker pod running — triggering rolling restart for safe cleanup", - "brokerId", brokerId) - if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { - if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, - banzaiv1beta1.ConfigOutOfSync, log); err != nil { - return false, errorfactory.New(errorfactory.StatusUpdateError{}, err, - "could not mark broker ConfigOutOfSync for orphaned cache PVC cleanup", "brokerId", brokerId) + } + + // If the pod is still running and any resize state entry refers to a mount path that is no + // longer desired (storage config removed mid-resize), trigger a rolling restart so the pod + // stops and the orphaned PVCs can be deleted on the next cycle. + if brokerPodExists { + for mp := range r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates { + if !desiredMountPaths[mp] { + log.Info("Orphaned cache resize state detected with broker pod running — triggering rolling restart", + "brokerId", brokerId, "mountPath", mp) + if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + banzaiv1beta1.ConfigOutOfSync, log); err != nil { + return false, errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not mark broker ConfigOutOfSync for orphaned cache PVC cleanup", "brokerId", brokerId) + } } + return true, nil } - return true, nil // Skip disk removal — wait for pod to stop before deleting cache PVCs. } } @@ -1451,11 +1564,20 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( if mountPath != pvc.Annotations[mountPathAnnotationKey] { continue } - // Skip the old PVC that is waiting to be deleted once the broker pod stops. - // The replacement PVC (same mount path) will be matched on the next iteration. - if pvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizePendingDeletion { + // Skip PVCs that are being deleted — the API server may still return them briefly + // after a Delete call while the pvc-protection finalizer is being removed. + // Matching a terminating PVC as the current PVC would cause a spurious re-staging. + if pvc.DeletionTimestamp != nil { continue } + // Skip the old PVC that is waiting to be deleted once the broker pod stops. + // We identify it as the PVC whose size differs from the desired size while a + // pending-deletion resize is in flight for this mount path. + if r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates[mountPath] == banzaiv1beta1.CacheResizePendingDeletion { + if desiredPvc.Spec.Resources.Requests.Storage().Value() != pvc.Spec.Resources.Requests.Storage().Value() { + continue + } + } currentPvc = pvc.DeepCopy() alreadyCreated = true @@ -1502,10 +1624,10 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( continue } - // A replacement PVC is the new smaller PVC staged during a tiered storage cache resize. - // It already has the right size; we only need to keep the rolling upgrade running so - // the broker pod stops and the pending-deletion PVC can be cleaned up. - if currentPvc.Annotations[pvcCacheResizeStateAnnotation] == pvcCacheResizeReplacement { + // A resize is in flight for this mount path: the matched PVC is the replacement. + // Keep triggering ConfigOutOfSync so the rolling upgrade completes and the old PVC + // can be deleted on the next reconcile cycle. + if r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates[mountPath] == banzaiv1beta1.CacheResizePendingDeletion { if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, banzaiv1beta1.ConfigOutOfSync, log); err != nil { @@ -1579,19 +1701,17 @@ func (r *Reconciler) stageTieredCachePVCShrink( "currentSize", currentSize, "desiredSize", desiredSize) - // Annotate old PVC so it is skipped by getCreatedPvcForBroker and cleaned up - // the moment the broker pod stops. - oldPvcCopy := currentPvc.DeepCopy() - oldPvcCopy.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizePendingDeletion - if err := r.Update(ctx, oldPvcCopy); err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, - "annotating old tiered storage cache PVC for deletion failed", "pvc", currentPvc.Name) + // Record the resize in brokerState so the reconciler knows to exclude the old PVC and + // keep the rolling upgrade running until the old PVC is cleaned up. + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + map[string]banzaiv1beta1.CacheResizeState{mountPath: banzaiv1beta1.CacheResizePendingDeletion}, log); err != nil { + return errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not record cache resize state for tiered storage cache PVC resize", "brokerId", brokerId) } - log.Info("Marked old tiered storage cache PVC for deletion", "brokerId", brokerId, "pvc", currentPvc.Name) + log.Info("Recorded pending-deletion resize state for tiered storage cache PVC", + "brokerId", brokerId, "mountPath", mountPath, "pvc", currentPvc.Name) // Create the replacement PVC immediately so provisioning starts now. - desiredPvc.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizeReplacement - desiredPvc.Annotations[pvcCacheResizeReplacesPVC] = currentPvc.Name if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { return errors.WrapIf(err, "could not apply last state to annotation") } diff --git a/pkg/resources/kafka/kafka_test.go b/pkg/resources/kafka/kafka_test.go index 64082f443..0d59680ad 100644 --- a/pkg/resources/kafka/kafka_test.go +++ b/pkg/resources/kafka/kafka_test.go @@ -1403,24 +1403,24 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { } testCases := []struct { - testName string - existingPvc *corev1.PersistentVolumeClaim - desiredPvc *corev1.PersistentVolumeClaim - existingPods []corev1.Pod - // expectedUpdatePvc: old PVC annotated pending-deletion, or annotation stripped on resize-complete - expectedUpdatePvc bool - expectedCreatePvc bool - expectedDeletePvc bool - expectedError bool + testName string + existingPvc *corev1.PersistentVolumeClaim + desiredPvc *corev1.PersistentVolumeClaim + existingPods []corev1.Pod + initialCacheVolumeState v1beta1.CacheResizeState // pre-existing brokerState for mountPath, if any + expectedUpdatePvc bool + expectedCreatePvc bool + expectedDeletePvc bool + expectedError bool }{ { - // Pod is up: annotate old PVC as pending-deletion, create replacement PVC, - // set ConfigOutOfSync to trigger rolling upgrade. No error — state is durable. - testName: "size decrease with running pod — old PVC annotated, replacement PVC created", + // Pod is up, no prior resize state: record CacheResizePendingDeletion in brokerState, + // create replacement PVC, set ConfigOutOfSync to trigger rolling upgrade. + testName: "size decrease with running pod — resize state recorded, replacement PVC created", existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), existingPods: []corev1.Pod{runningPod}, - expectedUpdatePvc: true, + expectedUpdatePvc: false, expectedCreatePvc: true, expectedDeletePvc: false, expectedError: false, @@ -1428,59 +1428,52 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { { // Terminating pod is treated as having no running pod: staging starts immediately // so the replacement PVC is provisioned during the drain window. - testName: "size decrease with terminating pod — old PVC annotated, replacement PVC created", + testName: "size decrease with terminating pod — resize state recorded, replacement PVC created", existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), existingPods: []corev1.Pod{terminatingPod}, - expectedUpdatePvc: true, + expectedUpdatePvc: false, expectedCreatePvc: true, expectedDeletePvc: false, expectedError: false, }, { - // Pod is terminating and old PVC already has pending-deletion annotation: - // the Terminating pod is treated as gone, so cleanup fires immediately. - testName: "pending-deletion PVC with terminating pod — old PVC deleted", - existingPvc: func() *corev1.PersistentVolumeClaim { - pvc := makeTieredCachePvc("cache-pvc-1", "100Gi") - pvc.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizePendingDeletion - return pvc - }(), - desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), - existingPods: []corev1.Pod{terminatingPod}, - expectedUpdatePvc: false, - expectedCreatePvc: true, - expectedDeletePvc: true, - expectedError: false, + // Resize state already recorded and pod is terminating (treated as gone): + // cleanup fires — old PVC (larger size) is deleted, state is cleared. + testName: "pending-deletion state with terminating pod — old PVC deleted", + existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), + desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), + existingPods: []corev1.Pod{terminatingPod}, + initialCacheVolumeState: v1beta1.CacheResizePendingDeletion, + expectedUpdatePvc: false, + expectedCreatePvc: true, + expectedDeletePvc: true, + expectedError: false, }, { - // Pod is already gone on first detection: stage the replacement anyway so the - // state machine is consistent. Cleanup of pending-deletion PVC happens on the - // next cycle once the annotation is observed. - testName: "size decrease with pod already gone — old PVC annotated, replacement PVC created", + // Pod is already gone, no prior resize state: record state and create replacement PVC. + // Cleanup of old PVC happens on next cycle when reconciler re-observes the state. + testName: "size decrease with pod already gone — resize state recorded, replacement PVC created", existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), existingPods: []corev1.Pod{}, - expectedUpdatePvc: true, + expectedUpdatePvc: false, expectedCreatePvc: true, expectedDeletePvc: false, expectedError: false, }, { - // Pod is gone and old PVC already has pending-deletion annotation: - // cleanup fires — old PVC is deleted. Replacement PVC already exists. - testName: "size decrease with pod gone and old PVC pending-deletion — old PVC deleted", - existingPvc: func() *corev1.PersistentVolumeClaim { - pvc := makeTieredCachePvc("cache-pvc-1", "100Gi") - pvc.Annotations[pvcCacheResizeStateAnnotation] = pvcCacheResizePendingDeletion - return pvc - }(), - desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), - existingPods: []corev1.Pod{}, - expectedUpdatePvc: false, - expectedCreatePvc: true, - expectedDeletePvc: true, - expectedError: false, + // Resize state already recorded and pod is gone: + // cleanup fires — old PVC (larger size) is deleted, state is cleared. + testName: "pending-deletion state with pod gone — old PVC deleted", + existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), + desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), + existingPods: []corev1.Pod{}, + initialCacheVolumeState: v1beta1.CacheResizePendingDeletion, + expectedUpdatePvc: false, + expectedCreatePvc: true, + expectedDeletePvc: true, + expectedError: false, }, { // size increase — no special handling, regular PVC update path. @@ -1503,18 +1496,28 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { mockClient := mocks.NewMockClient(mockCtrl) mockSubResourceClient := mocks.NewMockSubResourceClient(mockCtrl) - r := Reconciler{ - Reconciler: resources.Reconciler{ - Client: mockClient, - KafkaCluster: &v1beta1.KafkaCluster{ - ObjectMeta: metav1.ObjectMeta{Name: clusterName, Namespace: namespace}, - Spec: v1beta1.KafkaClusterSpec{ - Brokers: []v1beta1.Broker{{ - Id: 0, - BrokerConfig: &v1beta1.BrokerConfig{Roles: []string{"broker"}}, - }}, + kafkaCluster := &v1beta1.KafkaCluster{ + ObjectMeta: metav1.ObjectMeta{Name: clusterName, Namespace: namespace}, + Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{ + Id: 0, + BrokerConfig: &v1beta1.BrokerConfig{Roles: []string{"broker"}}, + }}, + }, + } + if test.initialCacheVolumeState != "" { + kafkaCluster.Status.BrokersState = map[string]v1beta1.BrokerState{ + brokerId: { + CacheVolumeStates: map[string]v1beta1.CacheResizeState{ + mountPath: test.initialCacheVolumeState, }, }, + } + } + r := Reconciler{ + Reconciler: resources.Reconciler{ + Client: mockClient, + KafkaCluster: kafkaCluster, }, } diff --git a/tests/e2e/test_tiered_storage_cache_resize.go b/tests/e2e/test_tiered_storage_cache_resize.go index b45ffdb04..0256775f8 100644 --- a/tests/e2e/test_tiered_storage_cache_resize.go +++ b/tests/e2e/test_tiered_storage_cache_resize.go @@ -37,11 +37,6 @@ const ( tsResizeInitialSize = "2Gi" tsResizeShrunkSize = "1Gi" - // Annotation keys written by the cache-resize reconciler. - pvcResizeStateAnnotation = "koperator.adobe.com/cache-resize-state" - pvcResizeStatePending = "pending-deletion" - pvcResizeStateReplacement = "replacement" - tsResizePhaseTimeout = 10 * time.Minute tsResizePollingInterval = 15 * time.Second @@ -53,6 +48,36 @@ type pvcItem struct { Name string Annotations map[string]string StorageSize string + Phase string +} + +// getCacheResizeState returns the CacheVolumeStates entry for the given broker and mount path +// from the KafkaCluster CR status, or an empty string if not set. +func getCacheResizeState(kubectlOptions k8s.KubectlOptions, clusterName, brokerID, mountPath string) (string, error) { + rawOutput, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + "get", kafkaKind, clusterName, + "--output", "json", + ) + if err != nil { + return "", errors.WrapIf(err, "getting KafkaCluster failed") + } + + var cr struct { + Status struct { + BrokersState map[string]struct { + CacheVolumeStates map[string]string `json:"cacheVolumeStates"` + } `json:"brokersState"` + } `json:"status"` + } + if err := json.Unmarshal([]byte(rawOutput), &cr); err != nil { + return "", errors.WrapIf(err, "parsing KafkaCluster JSON failed") + } + + brokerState, ok := cr.Status.BrokersState[brokerID] + if !ok { + return "", nil + } + return brokerState.CacheVolumeStates[mountPath], nil } // listBrokerCachePVCs returns PVCs for broker tsResizeBrokerID that have the @@ -82,6 +107,9 @@ func listBrokerCachePVCs(kubectlOptions k8s.KubectlOptions) ([]pvcItem, error) { } `json:"requests"` } `json:"resources"` } `json:"spec"` + Status struct { + Phase string `json:"phase"` + } `json:"status"` } `json:"items"` } if err := json.Unmarshal([]byte(rawOutput), &pvcList); err != nil { @@ -95,6 +123,7 @@ func listBrokerCachePVCs(kubectlOptions k8s.KubectlOptions) ([]pvcItem, error) { Name: item.Metadata.Name, Annotations: item.Metadata.Annotations, StorageSize: item.Spec.Resources.Requests.Storage, + Phase: item.Status.Phase, }) } } @@ -265,33 +294,31 @@ func testTieredStorageCachePvcResize() bool { applyK8sResourceManifest(kubectlOptions, tsResizeShrunkManifest) }) - ginkgo.It("Phase 1: old PVC annotated pending-deletion and replacement PVC created", func() { - ginkgo.By("Waiting until both pending-deletion and replacement PVCs coexist for broker 0") + ginkgo.It("Phase 1: resize state recorded in brokerState and replacement PVC created", func() { + ginkgo.By("Waiting until CacheVolumeStates has pending-deletion and two PVCs coexist for broker 0") gomega.Eventually(context.Background(), func() error { - pvcs, err := listBrokerCachePVCs(kubectlOptions) + state, err := getCacheResizeState(kubectlOptions, tsResizeClusterName, + fmt.Sprintf("%d", tsResizeBrokerID), tsResizeCacheMountPath) if err != nil { return err } - var hasPendingDeletion, hasReplacement bool - for _, pvc := range pvcs { - switch pvc.Annotations[pvcResizeStateAnnotation] { - case pvcResizeStatePending: - hasPendingDeletion = true - case pvcResizeStateReplacement: - hasReplacement = true - } + if state != "pending-deletion" { + return fmt.Errorf("expected cacheVolumeStates[%s]=%q, got %q", + tsResizeCacheMountPath, "pending-deletion", state) } - if !hasPendingDeletion { - return errors.New("no PVC with pending-deletion annotation yet") + pvcs, err := listBrokerCachePVCs(kubectlOptions) + if err != nil { + return err } - if !hasReplacement { - return errors.New("no PVC with replacement annotation yet") + if len(pvcs) < 2 { + return fmt.Errorf("expected 2 cache PVCs (old + replacement) for broker %d, got %d", + tsResizeBrokerID, len(pvcs)) } return nil }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) }) - ginkgo.It("Phase 2: broker pod restarts, pending-deletion PVC is deleted", func() { + ginkgo.It("Phase 2: broker pod restarts, old PVC is deleted, resize state is cleared", func() { ginkgo.By("Waiting for broker-0 pod to be recycled (UID change indicates rolling restart)") // We detect recycling by UID change rather than waiting for the pod count to hit zero. // The pod may restart fast enough to be back before the next polling tick, causing a @@ -308,42 +335,53 @@ func testTieredStorageCachePvcResize() bool { return nil }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) - ginkgo.By("Waiting for the pending-deletion PVC to be deleted") + ginkgo.By("Waiting for old PVC to be deleted and resize state to be cleared") gomega.Eventually(context.Background(), func() error { + // The old (larger) PVC should be gone — only the replacement should remain. pvcs, err := listBrokerCachePVCs(kubectlOptions) if err != nil { return err } + activePvcs := make([]pvcItem, 0, len(pvcs)) for _, pvc := range pvcs { - if pvc.Annotations[pvcResizeStateAnnotation] == pvcResizeStatePending { - return fmt.Errorf("pending-deletion PVC %s still exists", pvc.Name) + // Ignore PVCs that are being deleted (DeletionTimestamp set but not yet gone). + if pvc.Phase != "" { + activePvcs = append(activePvcs, pvc) } } + if len(activePvcs) != 1 { + return fmt.Errorf("expected 1 active cache PVC for broker %d after pod restart, got %d", + tsResizeBrokerID, len(activePvcs)) + } + // The resize state should be cleared once the old PVC is gone. + state, err := getCacheResizeState(kubectlOptions, tsResizeClusterName, + fmt.Sprintf("%d", tsResizeBrokerID), tsResizeCacheMountPath) + if err != nil { + return err + } + if state != "" { + return fmt.Errorf("expected cacheVolumeStates[%s] to be cleared, got %q", + tsResizeCacheMountPath, state) + } return nil }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) }) - ginkgo.It("Phase 3: broker pod running again with new PVC, replacement annotation stripped", func() { + ginkgo.It("Phase 3: broker pod running again with the new smaller PVC", func() { ginkgo.By("Waiting for the broker-0 pod to come back Ready") err = waitK8sResourceCondition(kubectlOptions, "pod", "condition=Ready", tsResizePhaseTimeout, fmt.Sprintf("%s=%s,brokerId=0,app=kafka", kafkaCRLabelKey, tsResizeClusterName), "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) - ginkgo.By("Waiting for replacement annotation to be stripped from the surviving PVC") - gomega.Eventually(context.Background(), func() error { - pvcs, err := listBrokerCachePVCs(kubectlOptions) - if err != nil { - return err - } - if len(pvcs) != 1 { - return fmt.Errorf("expected 1 cache PVC for broker 0, got %d", len(pvcs)) - } - if state := pvcs[0].Annotations[pvcResizeStateAnnotation]; state != "" { - return fmt.Errorf("cache-resize-state annotation still present: %q", state) - } - return nil - }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) + ginkgo.By("Verifying exactly one cache PVC remains with no resize state") + pvcs, err := listBrokerCachePVCs(kubectlOptions) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(pvcs).To(gomega.HaveLen(1), "expected exactly one cache PVC after resize completes") + state, err := getCacheResizeState(kubectlOptions, tsResizeClusterName, + fmt.Sprintf("%d", tsResizeBrokerID), tsResizeCacheMountPath) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(state).To(gomega.BeEmpty(), "cacheVolumeStates entry should be cleared after resize") }) ginkgo.It("Verifying the surviving cache PVC has the new size "+tsResizeShrunkSize, func() { From 2ebc9bee6a7e1d26607ac0ce9c711171791e4ed9 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Tue, 7 Apr 2026 14:40:05 +0300 Subject: [PATCH 08/21] lint, test timeout and generated manifest --- api/v1beta1/zz_generated.deepcopy.go | 7 + charts/kafka-operator/crds/kafkaclusters.yaml | 11 +- .../kafka.banzaicloud.io_kafkaclusters.yaml | 11 +- pkg/resources/kafka/kafka.go | 165 ++++++++++-------- tests/e2e/test_multidisk_removal.go | 11 +- 5 files changed, 120 insertions(+), 85 deletions(-) diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index d5cfaf00d..2829a1d40 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -209,6 +209,13 @@ func (in *BrokerState) DeepCopyInto(out *BrokerState) { *out = make(ExternalListenerConfigNames, len(*in)) copy(*out, *in) } + if in.CacheVolumeStates != nil { + in, out := &in.CacheVolumeStates, &out.CacheVolumeStates + *out = make(map[string]CacheResizeState, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new BrokerState. diff --git a/charts/kafka-operator/crds/kafkaclusters.yaml b/charts/kafka-operator/crds/kafkaclusters.yaml index 273ec3028..f1859b527 100644 --- a/charts/kafka-operator/crds/kafkaclusters.yaml +++ b/charts/kafka-operator/crds/kafkaclusters.yaml @@ -23094,13 +23094,12 @@ spec: properties: cacheVolumeStates: additionalProperties: - description: CacheResizeState tracks the resize lifecycle of - a tiered storage cache PVC for a given mount path. + description: CacheResizeState tracks the resize lifecycle + of a tiered storage cache PVC for a given mount path. type: string - description: CacheVolumeStates tracks in-flight tiered storage - cache PVC resize operations, keyed by mount path. An entry - is present only while a resize is in progress; it is cleared - once cleanup completes. + description: |- + CacheVolumeStates tracks in-flight tiered storage cache PVC resize operations, keyed by mount path. + An entry is present only while a resize is in progress; it is cleared once cleanup completes. type: object configurationBackup: description: Compressed data from broker configuration to restore diff --git a/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml b/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml index 273ec3028..f1859b527 100644 --- a/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml +++ b/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml @@ -23094,13 +23094,12 @@ spec: properties: cacheVolumeStates: additionalProperties: - description: CacheResizeState tracks the resize lifecycle of - a tiered storage cache PVC for a given mount path. + description: CacheResizeState tracks the resize lifecycle + of a tiered storage cache PVC for a given mount path. type: string - description: CacheVolumeStates tracks in-flight tiered storage - cache PVC resize operations, keyed by mount path. An entry - is present only while a resize is in progress; it is cleared - once cleanup completes. + description: |- + CacheVolumeStates tracks in-flight tiered storage cache PVC resize operations, keyed by mount path. + An entry is present only while a resize is in progress; it is cleared once cleanup completes. type: object configurationBackup: description: Compressed data from broker configuration to restore diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index 5b9684393..b7de5c815 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -1244,60 +1244,13 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro continue } - // Delete tiered storage cache PVCs whose mount path is no longer desired. - // These are not Kafka log dirs so they must never go through CC disk removal. - for i := range pvcList.Items { - pvc := &pvcList.Items[i] - if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { - continue - } - if !desiredMountPaths[pvc.Annotations["mountPath"]] { - log.Info("Deleting removed tiered storage cache PVC", - "brokerId", brokerId, "mountPath", pvc.Annotations["mountPath"], "pvc", pvc.Name) - if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { - return errorfactory.New(errorfactory.APIFailure{}, err, - "deleting removed tiered storage cache PVC failed", "pvc", pvc.Name) - } - } + if err := r.deleteRemovedCachePVCs(ctx, log, brokerId, pvcList, desiredMountPaths); err != nil { + return err } - // Delete duplicate non-terminating cache PVCs at desired mount paths when no resize is in - // flight. These are orphaned replacements from a double-staging race (two reconcile cycles - // both staged a replacement before the first deletion propagated). Keep only the Bound one; - // delete any Pending duplicates. Safe only when the broker pod is down. if !brokerPodExists { - cacheVolumeStates := r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates - boundByMountPath := make(map[string]bool) - for _, pvc := range pvcList.Items { - if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { - continue - } - mp := pvc.Annotations["mountPath"] - if !desiredMountPaths[mp] || cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { - continue - } - if pvc.Status.Phase == corev1.ClaimBound { - boundByMountPath[mp] = true - } - } - for i := range pvcList.Items { - pvc := &pvcList.Items[i] - if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { - continue - } - mp := pvc.Annotations["mountPath"] - if !desiredMountPaths[mp] || cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { - continue - } - // Delete a Pending duplicate only when a Bound PVC already exists at this path. - if pvc.Status.Phase == corev1.ClaimPending && boundByMountPath[mp] { - log.Info("Deleting orphaned Pending cache PVC (duplicate at same mount path)", - "brokerId", brokerId, "mountPath", mp, "pvc", pvc.Name) - if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { - return errorfactory.New(errorfactory.APIFailure{}, err, - "deleting orphaned Pending cache PVC failed", "pvc", pvc.Name) - } - } + if err := r.cleanupOrphanedDuplicateCachePVCs(ctx, log, brokerId, pvcList, desiredMountPaths); err != nil { + return err } } @@ -1306,24 +1259,7 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro return errorfactory.New(errorfactory.APIFailure{}, err, "getting resource failed", "kind", desiredType) } - // Handle disk removal — for mount paths with an in-flight cache resize two PVCs temporarily - // coexist (old pending-deletion + new replacement). Count each such path only once to avoid - // a false-positive disk-removal trigger. - cacheVolumeStates := r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates - countedCacheMountPaths := make(map[string]bool) - effectivePvcCount := 0 - for _, pvc := range pvcList.Items { - mp := pvc.Annotations["mountPath"] - if cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { - if !countedCacheMountPaths[mp] { - countedCacheMountPaths[mp] = true - effectivePvcCount++ - } - } else { - effectivePvcCount++ - } - } - if effectivePvcCount > len(desiredPvcs) { + if r.effectivePvcCount(brokerId, pvcList) > len(desiredPvcs) { waitForDiskRemovalToFinish, err = handleDiskRemoval(ctx, pvcList, desiredPvcs, r, brokerId, log, desiredType, brokerVolumesState) if err != nil { return err @@ -1353,6 +1289,97 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro return nil } +// deleteRemovedCachePVCs deletes tiered storage cache PVCs whose mount path is no longer desired. +// These volumes are not Kafka log dirs and must never go through CC disk removal. +func (r *Reconciler) deleteRemovedCachePVCs( + ctx context.Context, + log logr.Logger, + brokerId string, + pvcList *corev1.PersistentVolumeClaimList, + desiredMountPaths map[string]bool, +) error { + for i := range pvcList.Items { + pvc := &pvcList.Items[i] + if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { + continue + } + if !desiredMountPaths[pvc.Annotations["mountPath"]] { + log.Info("Deleting removed tiered storage cache PVC", + "brokerId", brokerId, "mountPath", pvc.Annotations["mountPath"], "pvc", pvc.Name) + if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { + return errorfactory.New(errorfactory.APIFailure{}, err, + "deleting removed tiered storage cache PVC failed", "pvc", pvc.Name) + } + } + } + return nil +} + +// cleanupOrphanedDuplicateCachePVCs removes Pending cache PVCs when a Bound PVC already exists at +// the same mount path and no resize is in flight. These are orphaned replacements left by a +// double-staging race. Safe to call only when the broker pod is down. +func (r *Reconciler) cleanupOrphanedDuplicateCachePVCs( + ctx context.Context, + log logr.Logger, + brokerId string, + pvcList *corev1.PersistentVolumeClaimList, + desiredMountPaths map[string]bool, +) error { + cacheVolumeStates := r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates + boundByMountPath := make(map[string]bool) + for _, pvc := range pvcList.Items { + if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { + continue + } + mp := pvc.Annotations["mountPath"] + if !desiredMountPaths[mp] || cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { + continue + } + if pvc.Status.Phase == corev1.ClaimBound { + boundByMountPath[mp] = true + } + } + for i := range pvcList.Items { + pvc := &pvcList.Items[i] + if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { + continue + } + mp := pvc.Annotations["mountPath"] + if !desiredMountPaths[mp] || cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { + continue + } + if pvc.Status.Phase == corev1.ClaimPending && boundByMountPath[mp] { + log.Info("Deleting orphaned Pending cache PVC (duplicate at same mount path)", + "brokerId", brokerId, "mountPath", mp, "pvc", pvc.Name) + if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { + return errorfactory.New(errorfactory.APIFailure{}, err, + "deleting orphaned Pending cache PVC failed", "pvc", pvc.Name) + } + } + } + return nil +} + +// effectivePvcCount returns the number of PVCs for a broker, counting each pending-deletion cache +// mount path only once (old + replacement coexist during a shrink but represent one logical disk). +func (r *Reconciler) effectivePvcCount(brokerId string, pvcList *corev1.PersistentVolumeClaimList) int { + cacheVolumeStates := r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates + counted := make(map[string]bool) + total := 0 + for _, pvc := range pvcList.Items { + mp := pvc.Annotations["mountPath"] + if cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { + if !counted[mp] { + counted[mp] = true + total++ + } + } else { + total++ + } + } + return total +} + // getBrokerPodExists returns true if a non-terminating pod exists for the given broker. func (r *Reconciler) getBrokerPodExists(ctx context.Context, brokerId string) (bool, error) { brokerPodList := &corev1.PodList{} diff --git a/tests/e2e/test_multidisk_removal.go b/tests/e2e/test_multidisk_removal.go index 110d58d8a..a02db08e7 100644 --- a/tests/e2e/test_multidisk_removal.go +++ b/tests/e2e/test_multidisk_removal.go @@ -33,9 +33,10 @@ import ( ) const ( - multidiskRemovalTimeout = 1800 * time.Second // this test can take long: rebalance must finish before removal starts - multidiskRemovalPollInterval = 5 * time.Second - brokerConfigTemplateFormat = "%s-config-%d" + multidiskRemovalTimeout = 1800 * time.Second // this test can take long + multidiskRemovalPollInterval = 5 * time.Second + multidiskRemovalBrokerReadinessWait = 360 * time.Second // rolling restart of all brokers after disk removal + brokerConfigTemplateFormat = "%s-config-%d" ) var ( @@ -75,7 +76,9 @@ func testMultiDiskRemoval() bool { }) ginkgo.It("Asserting Kafka brokers remain healthy", func() { - err := waitK8sResourceCondition(kubectlOptions, "pod", "condition=Ready", defaultPodReadinessWaitTime, + // Use multidiskRemovalBrokerReadinessWait: after disk removal, CC triggers a rolling + // restart of all brokers, so 180s (defaultPodReadinessWaitTime) is too tight in kind. + err := waitK8sResourceCondition(kubectlOptions, "pod", "condition=Ready", multidiskRemovalBrokerReadinessWait, v1beta1.KafkaCRLabelKey+"="+kafkaClusterName+",app=kafka", "") gomega.Expect(err).NotTo(gomega.HaveOccurred()) }) From b029d7d795c50c3f5b1435516111b9194c68efbd Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Wed, 8 Apr 2026 15:53:57 +0300 Subject: [PATCH 09/21] fix e2e test --- pkg/resources/kafka/kafka.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index b7de5c815..dac854386 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -1665,14 +1665,6 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( continue } - if !k8sutil.CheckIfObjectUpdated(log, desiredType, currentPvc, desiredPvc) { - continue - } - - if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { - return errors.WrapIf(err, "could not apply last state to annotation") - } - // Check if this is a tiered storage cache volume. Fall back to the desired PVC annotation // for PVCs created before the tieredStorageCache annotation was introduced. isTieredCache := currentPvc.Annotations["tieredStorageCache"] == annotationTrue || @@ -1690,6 +1682,14 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( continue } + if !k8sutil.CheckIfObjectUpdated(log, desiredType, currentPvc, desiredPvc) { + continue + } + + if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { + return errors.WrapIf(err, "could not apply last state to annotation") + } + // Regular validation: size decreases are forbidden for non-cache volumes. if isDesiredStorageValueInvalid(desiredPvc, currentPvc) { return errorfactory.New(errorfactory.InternalError{}, errors.New("could not modify pvc size"), From 1012d182ca3f54303a4c41537ff9ea7716398d19 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Wed, 6 May 2026 18:22:09 +0300 Subject: [PATCH 10/21] fix: prevent data loss via tieredStorageCache spec flip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A KafkaCluster CR writer could flip tieredStorageCache from false to true on an existing storageConfigs entry, causing the operator to silently exclude the volume from log.dirs and Cruise Control capacity, and subsequently route a shrink through the delete-and-recreate path that bypasses graceful disk drain — destroying live log data. Fix in three layers: 1. Validating webhook rejects flips on existing storageConfigs entries (matched per mountPath, scoped per BrokerConfigGroup and per Broker). 2. deleteRemovedCachePVCs is now gated on !brokerPodExists so a removal while the broker pod is still running cannot issue a Delete against a mounted PVC. 3. The tiered-cache classification trusts only the persisted PVC annotation, never the desired-spec annotation. The annotation backfill block is removed, since it converted the unprotected CR field into a persistent ground-truth tag on live data PVCs. Co-Authored-By: Claude Opus 4.7 (1M context) --- pkg/resources/kafka/kafka.go | 36 ++--- pkg/webhooks/errors.go | 5 + pkg/webhooks/kafkacluster_validator.go | 62 ++++++++- pkg/webhooks/kafkacluster_validator_test.go | 139 ++++++++++++++++++++ 4 files changed, 216 insertions(+), 26 deletions(-) diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index dac854386..a9bc9376b 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -1244,11 +1244,13 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro continue } - if err := r.deleteRemovedCachePVCs(ctx, log, brokerId, pvcList, desiredMountPaths); err != nil { - return err - } - + // Only delete cache PVCs when the broker pod is gone — issuing Delete against a mounted + // PVC sets the DeletionTimestamp and is held by the pvc-protection finalizer, but the + // intent is wrong and the resulting PVC state confuses subsequent reconciles. if !brokerPodExists { + if err := r.deleteRemovedCachePVCs(ctx, log, brokerId, pvcList, desiredMountPaths); err != nil { + return err + } if err := r.cleanupOrphanedDuplicateCachePVCs(ctx, log, brokerId, pvcList, desiredMountPaths); err != nil { return err } @@ -1608,23 +1610,6 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( currentPvc = pvc.DeepCopy() alreadyCreated = true - // Backfill the tieredStorageCache annotation if the desired PVC has it but the existing - // PVC does not — handles PVCs created before the annotation was introduced. - if desiredPvc.Annotations["tieredStorageCache"] == annotationTrue && - currentPvc.Annotations["tieredStorageCache"] != annotationTrue { - pvcCopy := currentPvc.DeepCopy() - if pvcCopy.Annotations == nil { - pvcCopy.Annotations = make(map[string]string) - } - pvcCopy.Annotations["tieredStorageCache"] = annotationTrue - if err := r.Update(ctx, pvcCopy); err != nil { - return errorfactory.New(errorfactory.APIFailure{}, err, - "backfilling tieredStorageCache annotation on existing PVC failed", "pvc", pvcCopy.Name) - } - log.Info("Backfilled tieredStorageCache annotation on existing PVC", "pvc", pvcCopy.Name) - currentPvc = pvcCopy - } - // Trigger a CC disk rebalance only for regular data volumes. // Tiered storage cache PVCs are ephemeral — CC must not account for them. if currentPvc.Annotations["tieredStorageCache"] != annotationTrue { @@ -1665,10 +1650,11 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( continue } - // Check if this is a tiered storage cache volume. Fall back to the desired PVC annotation - // for PVCs created before the tieredStorageCache annotation was introduced. - isTieredCache := currentPvc.Annotations["tieredStorageCache"] == annotationTrue || - desiredPvc.Annotations["tieredStorageCache"] == annotationTrue + // Trust only the persisted PVC annotation when classifying — never the desired-spec + // annotation. Reading from the desired side would let a CR-edit (flipping + // tieredStorageCache: false → true on an existing data volume) route a real log-dir PVC + // through the cache-shrink delete-and-recreate path, bypassing graceful disk drain. + isTieredCache := currentPvc.Annotations["tieredStorageCache"] == annotationTrue desiredSize := desiredPvc.Spec.Resources.Requests.Storage().Value() currentSize := currentPvc.Spec.Resources.Requests.Storage().Value() diff --git a/pkg/webhooks/errors.go b/pkg/webhooks/errors.go index 58f3d9ef1..0938847e5 100644 --- a/pkg/webhooks/errors.go +++ b/pkg/webhooks/errors.go @@ -29,6 +29,7 @@ const ( unsupportedRemovingStorageMsg = "removing storage from a broker is not supported" invalidExternalListenerStartingPortErrMsg = "invalid external listener starting port number" invalidContainerPortForIngressControllerErrMsg = "invalid trarget port number for ingress controller deployment" + immutableTieredStorageCacheErrMsg = "tieredStorageCache is immutable on existing storageConfigs entries; remove the entry and re-add to change" // errorDuringValidationMsg is added to infrastructure errors (e.g. failed to connect), but not to field validation errors errorDuringValidationMsg = "error during validation" @@ -62,6 +63,10 @@ func IsAdmissionInvalidExternalListenerPort(err error) bool { return apierrors.IsInvalid(err) && strings.Contains(err.Error(), invalidExternalListenerStartingPortErrMsg) } +func IsAdmissionImmutableTieredStorageCache(err error) bool { + return apierrors.IsInvalid(err) && strings.Contains(err.Error(), immutableTieredStorageCacheErrMsg) +} + func IsAdmissionErrorDuringValidation(err error) bool { return apierrors.IsInternalError(err) && strings.Contains(err.Error(), errorDuringValidationMsg) } diff --git a/pkg/webhooks/kafkacluster_validator.go b/pkg/webhooks/kafkacluster_validator.go index 486db357e..251e5a0b0 100644 --- a/pkg/webhooks/kafkacluster_validator.go +++ b/pkg/webhooks/kafkacluster_validator.go @@ -24,6 +24,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/webhook/admission" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/validation/field" "github.com/go-logr/logr" @@ -36,8 +37,10 @@ type KafkaClusterValidator struct { Log logr.Logger } -func (s KafkaClusterValidator) ValidateUpdate(ctx context.Context, _, kafkaClusterNew *banzaicloudv1beta1.KafkaCluster) (warnings admission.Warnings, err error) { +func (s KafkaClusterValidator) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (warnings admission.Warnings, err error) { var allErrs field.ErrorList + kafkaClusterOld := oldObj.(*banzaicloudv1beta1.KafkaCluster) + kafkaClusterNew := newObj.(*banzaicloudv1beta1.KafkaCluster) log := s.Log.WithValues("name", kafkaClusterNew.GetName(), "namespace", kafkaClusterNew.GetNamespace()) listenerErrs := checkInternalAndExternalListeners(&kafkaClusterNew.Spec) @@ -45,6 +48,8 @@ func (s KafkaClusterValidator) ValidateUpdate(ctx context.Context, _, kafkaClust allErrs = append(allErrs, listenerErrs...) } + allErrs = append(allErrs, checkTieredStorageCacheImmutability(&kafkaClusterOld.Spec, &kafkaClusterNew.Spec)...) + if len(allErrs) == 0 { return nil, nil } @@ -78,6 +83,61 @@ func (s KafkaClusterValidator) ValidateDelete(_ context.Context, _ *banzaicloudv return nil, nil } +// checkTieredStorageCacheImmutability rejects updates that flip the TieredStorageCache flag on an +// existing storageConfigs entry (matched by mountPath, scoped per brokerConfigGroup or per broker). +// Flipping false→true on a live data PVC silently removes it from log.dirs and Cruise Control +// capacity, and enables a delete-and-recreate shrink path that bypasses graceful disk drain — an +// irreversible data-loss vector when applied to a volume that already holds Kafka log segments. +// To change this property, callers must remove the entry and re-add it in a subsequent apply. +func checkTieredStorageCacheImmutability(oldSpec, newSpec *banzaicloudv1beta1.KafkaClusterSpec) field.ErrorList { + var allErrs field.ErrorList + + for groupName, newGroup := range newSpec.BrokerConfigGroups { + oldGroup, ok := oldSpec.BrokerConfigGroups[groupName] + if !ok { + continue + } + groupPath := field.NewPath("spec").Child("brokerConfigGroups").Key(groupName).Child("storageConfigs") + allErrs = append(allErrs, diffTieredStorageCache(oldGroup.StorageConfigs, newGroup.StorageConfigs, groupPath)...) + } + + oldByID := make(map[int32]banzaicloudv1beta1.Broker, len(oldSpec.Brokers)) + for _, b := range oldSpec.Brokers { + oldByID[b.Id] = b + } + for i, newBroker := range newSpec.Brokers { + oldBroker, ok := oldByID[newBroker.Id] + if !ok || oldBroker.BrokerConfig == nil || newBroker.BrokerConfig == nil { + continue + } + brokerPath := field.NewPath("spec").Child("brokers").Index(i).Child("brokerConfig").Child("storageConfigs") + allErrs = append(allErrs, diffTieredStorageCache(oldBroker.BrokerConfig.StorageConfigs, newBroker.BrokerConfig.StorageConfigs, brokerPath)...) + } + + return allErrs +} + +func diffTieredStorageCache(oldConfigs, newConfigs []banzaicloudv1beta1.StorageConfig, basePath *field.Path) field.ErrorList { + var allErrs field.ErrorList + oldByPath := make(map[string]banzaicloudv1beta1.StorageConfig, len(oldConfigs)) + for _, sc := range oldConfigs { + oldByPath[sc.MountPath] = sc + } + for i, newSC := range newConfigs { + oldSC, ok := oldByPath[newSC.MountPath] + if !ok { + continue + } + if oldSC.TieredStorageCache != newSC.TieredStorageCache { + allErrs = append(allErrs, field.Forbidden( + basePath.Index(i).Child("tieredStorageCache"), + immutableTieredStorageCacheErrMsg, + )) + } + } + return allErrs +} + // checkListeners validates the spec.listenersConfig object func checkInternalAndExternalListeners(kafkaClusterSpec *banzaicloudv1beta1.KafkaClusterSpec) field.ErrorList { var allErrs field.ErrorList diff --git a/pkg/webhooks/kafkacluster_validator_test.go b/pkg/webhooks/kafkacluster_validator_test.go index 952c3f389..6193a9a8b 100644 --- a/pkg/webhooks/kafkacluster_validator_test.go +++ b/pkg/webhooks/kafkacluster_validator_test.go @@ -258,6 +258,145 @@ func TestCheckExternalListenerStartingPort(t *testing.T) { } } +func TestCheckTieredStorageCacheImmutability(t *testing.T) { + mp := func(path string, tc bool) v1beta1.StorageConfig { + return v1beta1.StorageConfig{MountPath: path, TieredStorageCache: tc} + } + + testCases := []struct { + testName string + oldSpec v1beta1.KafkaClusterSpec + newSpec v1beta1.KafkaClusterSpec + expected field.ErrorList + }{ + { + testName: "no change — empty", + oldSpec: v1beta1.KafkaClusterSpec{}, + newSpec: v1beta1.KafkaClusterSpec{}, + expected: nil, + }, + { + testName: "brokerConfigGroups: flip false→true on existing mountPath rejected", + oldSpec: v1beta1.KafkaClusterSpec{ + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ + "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/kafka-logs/0", false)}}, + }, + }, + newSpec: v1beta1.KafkaClusterSpec{ + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ + "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/kafka-logs/0", true)}}, + }, + }, + expected: append(field.ErrorList{}, + field.Forbidden( + field.NewPath("spec").Child("brokerConfigGroups").Key("default").Child("storageConfigs").Index(0).Child("tieredStorageCache"), + immutableTieredStorageCacheErrMsg, + ), + ), + }, + { + testName: "brokerConfigGroups: flip true→false on existing mountPath rejected", + oldSpec: v1beta1.KafkaClusterSpec{ + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ + "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/cache/0", true)}}, + }, + }, + newSpec: v1beta1.KafkaClusterSpec{ + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ + "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/cache/0", false)}}, + }, + }, + expected: append(field.ErrorList{}, + field.Forbidden( + field.NewPath("spec").Child("brokerConfigGroups").Key("default").Child("storageConfigs").Index(0).Child("tieredStorageCache"), + immutableTieredStorageCacheErrMsg, + ), + ), + }, + { + testName: "brokerConfigGroups: new entry with tieredStorageCache=true allowed", + oldSpec: v1beta1.KafkaClusterSpec{ + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ + "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/kafka-logs/0", false)}}, + }, + }, + newSpec: v1beta1.KafkaClusterSpec{ + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ + "default": {StorageConfigs: []v1beta1.StorageConfig{ + mp("/kafka-logs/0", false), + mp("/cache/0", true), + }}, + }, + }, + expected: nil, + }, + { + testName: "brokerConfigGroups: remove entry, then re-add with different value — allowed (no entry to match against in old at the same mountPath after removal scenario; test single-pass: removed entry simply not in new)", + oldSpec: v1beta1.KafkaClusterSpec{ + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ + "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/cache/0", false)}}, + }, + }, + newSpec: v1beta1.KafkaClusterSpec{ + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ + "default": {StorageConfigs: []v1beta1.StorageConfig{}}, + }, + }, + expected: nil, + }, + { + testName: "brokers[].brokerConfig: flip false→true rejected", + oldSpec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ + StorageConfigs: []v1beta1.StorageConfig{mp("/kafka-logs/0", false)}, + }}}, + }, + newSpec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ + StorageConfigs: []v1beta1.StorageConfig{mp("/kafka-logs/0", true)}, + }}}, + }, + expected: append(field.ErrorList{}, + field.Forbidden( + field.NewPath("spec").Child("brokers").Index(0).Child("brokerConfig").Child("storageConfigs").Index(0).Child("tieredStorageCache"), + immutableTieredStorageCacheErrMsg, + ), + ), + }, + { + testName: "brokers[].brokerConfig: nil old config — no error (entry didn't exist)", + oldSpec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: nil}}, + }, + newSpec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ + StorageConfigs: []v1beta1.StorageConfig{mp("/cache/0", true)}, + }}}, + }, + expected: nil, + }, + { + testName: "brokerConfigGroup not present in old — no error (whole group is new)", + oldSpec: v1beta1.KafkaClusterSpec{ + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{}, + }, + newSpec: v1beta1.KafkaClusterSpec{ + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ + "new-group": {StorageConfigs: []v1beta1.StorageConfig{mp("/cache/0", true)}}, + }, + }, + expected: nil, + }, + } + + for _, testCase := range testCases { + t.Run(testCase.testName, func(t *testing.T) { + got := checkTieredStorageCacheImmutability(&testCase.oldSpec, &testCase.newSpec) + require.Equal(t, testCase.expected, got) + }) + } +} + func TestCheckTargetPortsCollisionForEnvoy(t *testing.T) { testCases := []struct { testName string From bb8094c7afc830803b7b394aaec3f5516e167ec6 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Wed, 6 May 2026 18:23:28 +0300 Subject: [PATCH 11/21] fix: eliminate cache PVC duplication and orphan-cleanup race Two related correctness issues in the tiered storage cache resize path: (HIGH-3) When the client times out on the replacement-PVC Create but the server has already accepted the request, the next reconcile cycle's cache hasn't observed the new PVC yet. The size filter in reconcileDesiredPvcsForBroker then excludes the old PVC, alreadyCreated remains false, and a second Create with a fresh GenerateName produces a duplicate replacement. Add a strongly-consistent (DirectClient) read before falling through to Create, gated on CacheResizePendingDeletion state at this mountPath. If a replacement at the desired size already exists, reuse it. (HIGH-2) cleanupOrphanedDuplicateCachePVCs existed specifically to mop up the duplicates produced by the staging race above. Its own implementation had a cache-lag bug: after handleBrokerCacheResizeCleanup deleted the old PVC and cleared the in-memory state in the same cycle, the cached PVC list still showed the deleted PVC as Bound (no DeletionTimestamp yet), so the function classified the legitimate replacement as a duplicate and deleted it. With the staging race closed, the cleanup function has no remaining purpose, and keeping a function whose implementation has a race that deletes legitimate replacements is worse than removing it. Delete the function and its call site. Co-Authored-By: Claude Opus 4.7 (1M context) --- pkg/resources/kafka/kafka.go | 76 +++++++++++++----------------------- 1 file changed, 28 insertions(+), 48 deletions(-) diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index a9bc9376b..20cb75d04 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -1251,9 +1251,6 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro if err := r.deleteRemovedCachePVCs(ctx, log, brokerId, pvcList, desiredMountPaths); err != nil { return err } - if err := r.cleanupOrphanedDuplicateCachePVCs(ctx, log, brokerId, pvcList, desiredMountPaths); err != nil { - return err - } } // Re-list so the disk removal count below reflects the deletions above. @@ -1317,51 +1314,6 @@ func (r *Reconciler) deleteRemovedCachePVCs( return nil } -// cleanupOrphanedDuplicateCachePVCs removes Pending cache PVCs when a Bound PVC already exists at -// the same mount path and no resize is in flight. These are orphaned replacements left by a -// double-staging race. Safe to call only when the broker pod is down. -func (r *Reconciler) cleanupOrphanedDuplicateCachePVCs( - ctx context.Context, - log logr.Logger, - brokerId string, - pvcList *corev1.PersistentVolumeClaimList, - desiredMountPaths map[string]bool, -) error { - cacheVolumeStates := r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates - boundByMountPath := make(map[string]bool) - for _, pvc := range pvcList.Items { - if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { - continue - } - mp := pvc.Annotations["mountPath"] - if !desiredMountPaths[mp] || cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { - continue - } - if pvc.Status.Phase == corev1.ClaimBound { - boundByMountPath[mp] = true - } - } - for i := range pvcList.Items { - pvc := &pvcList.Items[i] - if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { - continue - } - mp := pvc.Annotations["mountPath"] - if !desiredMountPaths[mp] || cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { - continue - } - if pvc.Status.Phase == corev1.ClaimPending && boundByMountPath[mp] { - log.Info("Deleting orphaned Pending cache PVC (duplicate at same mount path)", - "brokerId", brokerId, "mountPath", mp, "pvc", pvc.Name) - if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { - return errorfactory.New(errorfactory.APIFailure{}, err, - "deleting orphaned Pending cache PVC failed", "pvc", pvc.Name) - } - } - } - return nil -} - // effectivePvcCount returns the number of PVCs for a broker, counting each pending-deletion cache // mount path only once (old + replacement coexist during a shrink but represent one logical disk). func (r *Reconciler) effectivePvcCount(brokerId string, pvcList *corev1.PersistentVolumeClaimList) int { @@ -1626,6 +1578,34 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( } if !alreadyCreated { + // During an in-flight cache resize, a prior reconcile cycle may have already created + // the replacement PVC server-side even if the client got a timeout, or the cached + // client may not yet have observed an in-flight Create. The size filter above + // excludes the old PVC, so alreadyCreated is false even when a replacement exists. + // A strongly-consistent (uncached) read picks up that replacement so we don't issue a + // second Create with a fresh GenerateName. + if r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates[mountPath] == banzaiv1beta1.CacheResizePendingDeletion { + liveList := &corev1.PersistentVolumeClaimList{} + if err := r.DirectClient.List(ctx, liveList, client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels); err != nil { + return errorfactory.New(errorfactory.APIFailure{}, err, "uncached list of PVCs failed", "kind", desiredType) + } + desiredSize := desiredPvc.Spec.Resources.Requests.Storage().Value() + for i := range liveList.Items { + p := &liveList.Items[i] + if p.DeletionTimestamp != nil || p.Annotations["mountPath"] != mountPath { + continue + } + if p.Spec.Resources.Requests.Storage().Value() == desiredSize { + log.Info("Replacement cache PVC already exists from a prior partial attempt; skipping Create", + "brokerId", brokerId, "mountPath", mountPath, "pvc", p.Name) + alreadyCreated = true + break + } + } + if alreadyCreated { + continue + } + } // Creating the 2+ PersistentVolumes for Pod if err := patch.DefaultAnnotator.SetLastAppliedAnnotation(desiredPvc); err != nil { return errors.WrapIf(err, "could not apply last state to annotation") From e4b5a3936004b8b25e22dda260f4cdd49aa4ff59 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Thu, 7 May 2026 10:36:11 +0300 Subject: [PATCH 12/21] refactor: replace tieredStorageCache annotation key literal with constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PVC annotation key "tieredStorageCache" was a bare string literal at five sites across pvc.go and kafka.go (write at PVC creation, three reads gating cache-specific behavior, one test fixture). A typo at any read site would silently misclassify a cache PVC as a regular log-dir PVC, sending it through Cruise Control rebalance — silent data loss surface. Define TieredStorageCacheAnnotationKey in api/v1beta1 alongside the existing label/annotation key constants and reference it everywhere. Compile-time-checked from now on. Co-Authored-By: Claude Opus 4.7 (1M context) --- api/v1beta1/kafkacluster_types.go | 6 ++++++ pkg/resources/kafka/kafka.go | 6 +++--- pkg/resources/kafka/kafka_test.go | 4 ++-- pkg/resources/kafka/pvc.go | 2 +- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/api/v1beta1/kafkacluster_types.go b/api/v1beta1/kafkacluster_types.go index d636c0570..89ada878a 100644 --- a/api/v1beta1/kafkacluster_types.go +++ b/api/v1beta1/kafkacluster_types.go @@ -50,6 +50,12 @@ const ( // IsControllerNodeKey is used to identify if the kafka pod is a controller or broker_controller IsControllerNodeKey = "isControllerNode" + // TieredStorageCacheAnnotationKey marks a PVC as backing a Kafka tiered storage cache. + // Cache PVCs are excluded from log.dirs and Cruise Control capacity, and follow a + // delete-and-recreate path on shrink. Read sites must use this constant rather than the + // literal so a typo cannot silently misclassify a cache PVC as a regular log-dir volume. + TieredStorageCacheAnnotationKey = "tieredStorageCache" + // DefaultCruiseControlImage is the default CC image used when users don't specify it in CruiseControlConfig.Image DefaultCruiseControlImage = "adobe/cruise-control:3.0.3-adbe-20250804" // renovate: datasource=docker depName=adobe/cruise-control diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index 20cb75d04..a02288e63 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -1299,7 +1299,7 @@ func (r *Reconciler) deleteRemovedCachePVCs( ) error { for i := range pvcList.Items { pvc := &pvcList.Items[i] - if pvc.DeletionTimestamp != nil || pvc.Annotations["tieredStorageCache"] != annotationTrue { + if pvc.DeletionTimestamp != nil || pvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] != annotationTrue { continue } if !desiredMountPaths[pvc.Annotations["mountPath"]] { @@ -1564,7 +1564,7 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( // Trigger a CC disk rebalance only for regular data volumes. // Tiered storage cache PVCs are ephemeral — CC must not account for them. - if currentPvc.Annotations["tieredStorageCache"] != annotationTrue { + if currentPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] != annotationTrue { // Checking pvc state, if bounded, so the broker has already restarted and the CC GracefulDiskRebalance has not happened yet, // then we make it happening with status update. // If disk removal was set, and the disk was added back, we also need to mark the volume for rebalance @@ -1634,7 +1634,7 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( // annotation. Reading from the desired side would let a CR-edit (flipping // tieredStorageCache: false → true on an existing data volume) route a real log-dir PVC // through the cache-shrink delete-and-recreate path, bypassing graceful disk drain. - isTieredCache := currentPvc.Annotations["tieredStorageCache"] == annotationTrue + isTieredCache := currentPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == annotationTrue desiredSize := desiredPvc.Spec.Resources.Requests.Storage().Value() currentSize := currentPvc.Spec.Resources.Requests.Storage().Value() diff --git a/pkg/resources/kafka/kafka_test.go b/pkg/resources/kafka/kafka_test.go index 0d59680ad..2ef2d2176 100644 --- a/pkg/resources/kafka/kafka_test.go +++ b/pkg/resources/kafka/kafka_test.go @@ -1370,8 +1370,8 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { v1beta1.KafkaCRLabelKey: clusterName, }, Annotations: map[string]string{ - "mountPath": mountPath, - "tieredStorageCache": "true", + "mountPath": mountPath, + v1beta1.TieredStorageCacheAnnotationKey: "true", }, }, Spec: corev1.PersistentVolumeClaimSpec{ diff --git a/pkg/resources/kafka/pvc.go b/pkg/resources/kafka/pvc.go index fcd80b101..0451bf24a 100644 --- a/pkg/resources/kafka/pvc.go +++ b/pkg/resources/kafka/pvc.go @@ -66,7 +66,7 @@ func (r *Reconciler) pvc(brokerId int32, storageIndex int, storage v1beta1.Stora // Mark tiered storage cache PVCs with annotation for special handling if storage.TieredStorageCache { - annotations["tieredStorageCache"] = annotationTrue + annotations[v1beta1.TieredStorageCacheAnnotationKey] = annotationTrue } return &corev1.PersistentVolumeClaim{ From 36264708859798a78ba0bdd86dcf08ea6b89dd2a Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Thu, 7 May 2026 11:15:26 +0300 Subject: [PATCH 13/21] test: cover crash recovery and idempotent re-staging for cache resize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add three test cases to TestReconcileKafkaPvcTieredCacheResize covering the recovery paths exercised by the HIGH-3 idempotency fix: 1. Pending-deletion state with running pod and no replacement — the reconciler crashed between writing state and creating the replacement. Asserts: replacement is re-staged, old PVC is not deleted, state is preserved (since the resize hasn't completed). 2. Pending-deletion with replacement present (symmetric) — the inner-match loop's size filter correctly identifies the replacement when both PVCs are visible to the cached client. No duplicate Create. 3. Pending-deletion with replacement visible only via DirectClient (asymmetric) — simulates the cache-lag window where Create succeeded server-side but the watch event hasn't propagated. The cached Client sees only the old PVC; the idempotency check uses DirectClient and detects the replacement. This case is the actual HIGH-3 regression guard: removing the DirectClient lookup makes it fail. The test setup now wires a separate mockDirectClient so cached and uncached views can diverge per-test-case. Co-Authored-By: Claude Opus 4.7 (1M context) --- pkg/resources/kafka/kafka_test.go | 82 +++++++++++++++++++++++++++++-- 1 file changed, 79 insertions(+), 3 deletions(-) diff --git a/pkg/resources/kafka/kafka_test.go b/pkg/resources/kafka/kafka_test.go index 2ef2d2176..ae0bd31bc 100644 --- a/pkg/resources/kafka/kafka_test.go +++ b/pkg/resources/kafka/kafka_test.go @@ -1403,8 +1403,13 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { } testCases := []struct { - testName string - existingPvc *corev1.PersistentVolumeClaim + testName string + existingPvc *corev1.PersistentVolumeClaim + additionalExistingPvc *corev1.PersistentVolumeClaim // optional second PVC at the same mountPath (e.g., pre-staged replacement) + // directClientOnlyPvc: when set, this PVC is returned only by DirectClient (uncached) reads, + // not by the cached Client. Simulates the cache-lag window after a Create where the watch + // event hasn't propagated yet — the scenario the HIGH-3 idempotency fix protects against. + directClientOnlyPvc *corev1.PersistentVolumeClaim desiredPvc *corev1.PersistentVolumeClaim existingPods []corev1.Pod initialCacheVolumeState v1beta1.CacheResizeState // pre-existing brokerState for mountPath, if any @@ -1486,6 +1491,54 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { expectedDeletePvc: false, expectedError: false, }, + { + // Crash-recovery: state was written in a prior cycle but the replacement Create + // failed before completing. Pod is still running, so the old PVC must NOT be + // deleted and state must NOT be cleared. The reconciler must re-create the + // replacement. + testName: "pending-deletion with running pod and no replacement — replacement re-staged", + existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), + desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), + existingPods: []corev1.Pod{runningPod}, + initialCacheVolumeState: v1beta1.CacheResizePendingDeletion, + expectedUpdatePvc: false, + expectedCreatePvc: true, + expectedDeletePvc: false, + expectedError: false, + }, + { + // Idempotency (symmetric): pending-deletion state and a replacement at desired + // size already exists in both cached and uncached views. The inner-match loop + // finds the replacement; no second Create. + testName: "pending-deletion with running pod and replacement present — no duplicate Create", + existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), + additionalExistingPvc: makeTieredCachePvc("cache-pvc-replacement", "50Gi"), + desiredPvc: makeTieredCachePvc("cache-pvc-replacement", "50Gi"), + existingPods: []corev1.Pod{runningPod}, + initialCacheVolumeState: v1beta1.CacheResizePendingDeletion, + expectedUpdatePvc: false, + expectedCreatePvc: false, + expectedDeletePvc: false, + expectedError: false, + }, + { + // Idempotency (asymmetric — the actual HIGH-3 cache-lag scenario): the prior + // Create succeeded server-side, so the replacement exists in etcd, but the cached + // Client hasn't yet observed the watch event. The inner match loop sees only the + // old PVC (size-filtered out) and would fall through to a second Create — except + // the idempotency check uses DirectClient (uncached) and detects the replacement. + // If a future refactor removes the DirectClient lookup, this test fails. + testName: "pending-deletion with replacement visible only via DirectClient — no duplicate Create", + existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), + directClientOnlyPvc: makeTieredCachePvc("cache-pvc-replacement", "50Gi"), + desiredPvc: makeTieredCachePvc("cache-pvc-replacement", "50Gi"), + existingPods: []corev1.Pod{runningPod}, + initialCacheVolumeState: v1beta1.CacheResizePendingDeletion, + expectedUpdatePvc: false, + expectedCreatePvc: false, + expectedDeletePvc: false, + expectedError: false, + }, } mockCtrl := gomock.NewController(t) @@ -1494,6 +1547,7 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { test := test t.Run(test.testName, func(t *testing.T) { mockClient := mocks.NewMockClient(mockCtrl) + mockDirectClient := mocks.NewMockClient(mockCtrl) // separate so tests can simulate cache vs apiserver asymmetry mockSubResourceClient := mocks.NewMockSubResourceClient(mockCtrl) kafkaCluster := &v1beta1.KafkaCluster{ @@ -1517,11 +1571,14 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { r := Reconciler{ Reconciler: resources.Reconciler{ Client: mockClient, + DirectClient: mockDirectClient, KafkaCluster: kafkaCluster, }, } - // Mock PVC list — always returns the existing PVC + // Cached Client PVC list: returns the existing PVC plus any additional. Does NOT + // include directClientOnlyPvc — that simulates the cache-lag window where a Create + // has succeeded server-side but the watch event hasn't propagated. mockClient.EXPECT().List( context.TODO(), gomock.AssignableToTypeOf(&corev1.PersistentVolumeClaimList{}), @@ -1529,6 +1586,25 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { gomock.Any(), ).Do(func(ctx context.Context, list *corev1.PersistentVolumeClaimList, opts ...client.ListOption) { list.Items = []corev1.PersistentVolumeClaim{*test.existingPvc} + if test.additionalExistingPvc != nil { + list.Items = append(list.Items, *test.additionalExistingPvc) + } + }).Return(nil).AnyTimes() + + // Uncached DirectClient PVC list: same as cached, plus directClientOnlyPvc when set. + mockDirectClient.EXPECT().List( + context.TODO(), + gomock.AssignableToTypeOf(&corev1.PersistentVolumeClaimList{}), + client.InNamespace(namespace), + gomock.Any(), + ).Do(func(ctx context.Context, list *corev1.PersistentVolumeClaimList, opts ...client.ListOption) { + list.Items = []corev1.PersistentVolumeClaim{*test.existingPvc} + if test.additionalExistingPvc != nil { + list.Items = append(list.Items, *test.additionalExistingPvc) + } + if test.directClientOnlyPvc != nil { + list.Items = append(list.Items, *test.directClientOnlyPvc) + } }).Return(nil).AnyTimes() // Mock Pod list — returns the configured pods From 963784188c93679f29915bbd02573a8f15bd7a8b Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Thu, 7 May 2026 11:57:15 +0300 Subject: [PATCH 14/21] e2e tests --- ...r_tiered_storage_cache_resize_initial.yaml | 31 +++++++++++++++++- ...er_tiered_storage_cache_resize_shrunk.yaml | 31 +++++++++++++++++- tests/e2e/test_tiered_storage_cache_resize.go | 32 +++++++++++-------- 3 files changed, 79 insertions(+), 15 deletions(-) diff --git a/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml b/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml index b4df5bbd3..39ec42a3f 100644 --- a/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml +++ b/config/samples/kafkacluster_tiered_storage_cache_resize_initial.yaml @@ -10,9 +10,15 @@ spec: clusterImage: "ghcr.io/adobe/koperator/kafka:2.13-3.9.1" readOnlyConfig: | auto.create.topics.enable=false + cruise.control.metrics.topic.auto.create=true + cruise.control.metrics.topic.num.partitions=1 + cruise.control.metrics.topic.replication.factor=1 rollingUpgradeConfig: failureThreshold: 1 - cruiseControlConfig: {} + cruiseControlConfig: + topicConfig: + partitions: 1 + replicationFactor: 2 brokers: - id: 0 brokerConfig: @@ -42,7 +48,30 @@ spec: requests: cpu: "200m" memory: 1Gi + # Second broker: present only to satisfy the CRD's minimum replicationFactor=2 constraint + # for the Cruise Control coordination topic. Carries no tiered storage cache PVC and is + # not the broker under test (tsResizeBrokerID=0). - id: 1 + brokerConfig: + processRoles: + - broker + terminationGracePeriodSeconds: 10 + storageConfigs: + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + resourceRequirements: + limits: + cpu: "1" + memory: 2Gi + requests: + cpu: "200m" + memory: 1Gi + - id: 2 brokerConfig: processRoles: - controller diff --git a/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml b/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml index 5d158905a..16b966a3d 100644 --- a/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml +++ b/config/samples/kafkacluster_tiered_storage_cache_resize_shrunk.yaml @@ -10,9 +10,15 @@ spec: clusterImage: "ghcr.io/adobe/koperator/kafka:2.13-3.9.1" readOnlyConfig: | auto.create.topics.enable=false + cruise.control.metrics.topic.auto.create=true + cruise.control.metrics.topic.num.partitions=1 + cruise.control.metrics.topic.replication.factor=1 rollingUpgradeConfig: failureThreshold: 1 - cruiseControlConfig: {} + cruiseControlConfig: + topicConfig: + partitions: 1 + replicationFactor: 2 brokers: - id: 0 brokerConfig: @@ -42,7 +48,30 @@ spec: requests: cpu: "200m" memory: 1Gi + # Second broker: present only to satisfy the CRD's minimum replicationFactor=2 constraint + # for the Cruise Control coordination topic. Carries no tiered storage cache PVC and is + # not the broker under test (tsResizeBrokerID=0). - id: 1 + brokerConfig: + processRoles: + - broker + terminationGracePeriodSeconds: 10 + storageConfigs: + - mountPath: "/kafka-logs" + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + resourceRequirements: + limits: + cpu: "1" + memory: 2Gi + requests: + cpu: "200m" + memory: 1Gi + - id: 2 brokerConfig: processRoles: - controller diff --git a/tests/e2e/test_tiered_storage_cache_resize.go b/tests/e2e/test_tiered_storage_cache_resize.go index 0256775f8..5e6e1cfb3 100644 --- a/tests/e2e/test_tiered_storage_cache_resize.go +++ b/tests/e2e/test_tiered_storage_cache_resize.go @@ -233,10 +233,10 @@ func testTieredStorageCachePvcResize() bool { applyK8sResourceManifest(kubectlOptions, tsResizeInitialManifest) ginkgo.By("Waiting for all broker pods to be ready") - // kubectl wait --for=condition=Ready fails immediately when no pods exist yet. - // We don't wait for ClusterRunning — the cluster may stay in ClusterReconciling - // because GracefulDiskRebalanceRequired for log volumes needs CC, which this minimal - // test cluster does not deploy. Pod readiness is sufficient to proceed with the resize. + // kubectl wait --for=condition=Ready fails immediately when no pods exist yet, so + // poll the pod list explicitly. Pod readiness is sufficient to proceed with the + // resize — we don't gate on ClusterRunning because Cruise Control may still be + // completing the post-startup rebalance when the resize is initiated. gomega.Eventually(context.Background(), func() error { output, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, "get", "pod", @@ -295,26 +295,32 @@ func testTieredStorageCachePvcResize() bool { }) ginkgo.It("Phase 1: resize state recorded in brokerState and replacement PVC created", func() { - ginkgo.By("Waiting until CacheVolumeStates has pending-deletion and two PVCs coexist for broker 0") + // Two acceptable success paths — the resize can complete in seconds, often faster + // than the polling interval, so insisting on the intermediate "pending-deletion" + // snapshot makes this phase flake-prone: + // A) In-progress: cacheVolumeStates[mp]=="pending-deletion" AND ≥2 cache PVCs. + // B) Already completed: state cleared AND exactly one cache PVC at the shrunk size. + // Either proves the resize was staged correctly; Phase 2/3 cover the remaining + // invariants regardless of which path we observed. + ginkgo.By("Waiting until the resize is observable as in-progress or completed") gomega.Eventually(context.Background(), func() error { state, err := getCacheResizeState(kubectlOptions, tsResizeClusterName, fmt.Sprintf("%d", tsResizeBrokerID), tsResizeCacheMountPath) if err != nil { return err } - if state != "pending-deletion" { - return fmt.Errorf("expected cacheVolumeStates[%s]=%q, got %q", - tsResizeCacheMountPath, "pending-deletion", state) - } pvcs, err := listBrokerCachePVCs(kubectlOptions) if err != nil { return err } - if len(pvcs) < 2 { - return fmt.Errorf("expected 2 cache PVCs (old + replacement) for broker %d, got %d", - tsResizeBrokerID, len(pvcs)) + if state == "pending-deletion" && len(pvcs) >= 2 { + return nil } - return nil + if state == "" && len(pvcs) == 1 && pvcs[0].StorageSize == tsResizeShrunkSize { + return nil + } + return fmt.Errorf("not at expected staging state: cacheVolumeStates[%s]=%q, %d cache PVC(s)", + tsResizeCacheMountPath, state, len(pvcs)) }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) }) From 8a85e1550afe875bc4ffaa9ecc4825a8dcf56bcd Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Thu, 7 May 2026 16:51:54 +0300 Subject: [PATCH 15/21] agent review --- api/v1beta1/common_types.go | 20 +- api/v1beta1/zz_generated.deepcopy.go | 6 +- charts/kafka-operator/crds/kafkaclusters.yaml | 21 +- .../kafka.banzaicloud.io_kafkaclusters.yaml | 21 +- docs/tiered-storage-pvc-resize.md | 31 +-- pkg/k8sutil/status.go | 10 +- pkg/resources/kafka/kafka.go | 141 ++++++++++---- pkg/resources/kafka/kafka_test.go | 20 +- pkg/webhooks/kafkacluster_validator.go | 92 +++++---- pkg/webhooks/kafkacluster_validator_test.go | 180 ++++++++++-------- tests/e2e/test_tiered_storage_cache_resize.go | 27 +++ 11 files changed, 355 insertions(+), 214 deletions(-) diff --git a/api/v1beta1/common_types.go b/api/v1beta1/common_types.go index c7466a66f..57edfaf52 100644 --- a/api/v1beta1/common_types.go +++ b/api/v1beta1/common_types.go @@ -216,14 +216,17 @@ type VolumeState struct { CruiseControlOperationReference *corev1.LocalObjectReference `json:"cruiseControlOperationReference,omitempty"` } -// CacheResizeState tracks the resize lifecycle of a tiered storage cache PVC for a given mount path. -type CacheResizeState string +// TieredCacheVolumeState tracks the lifecycle state of a tiered storage cache PVC for a given mount path. +type TieredCacheVolumeState string const ( - // CacheResizePendingDeletion indicates that the old cache PVC at this mount path is waiting + // TieredCacheVolumeActive indicates the mount path is an active tiered storage cache volume + // with no resize operation in progress. + TieredCacheVolumeActive TieredCacheVolumeState = "active" + // TieredCacheVolumePendingDeletion indicates that the old cache PVC at this mount path is waiting // to be deleted once the broker pod stops. A replacement PVC with the new desired size has // already been created at the same mount path. - CacheResizePendingDeletion CacheResizeState = "pending-deletion" + TieredCacheVolumePendingDeletion TieredCacheVolumeState = "pending-deletion" ) // BrokerState holds information about broker state @@ -244,9 +247,12 @@ type BrokerState struct { Image string `json:"image,omitempty"` // Compressed data from broker configuration to restore broker pod in specific cases ConfigurationBackup string `json:"configurationBackup,omitempty"` - // CacheVolumeStates tracks in-flight tiered storage cache PVC resize operations, keyed by mount path. - // An entry is present only while a resize is in progress; it is cleared once cleanup completes. - CacheVolumeStates map[string]CacheResizeState `json:"cacheVolumeStates,omitempty"` + // TieredCacheVolumes tracks tiered storage cache PVC state for this broker, keyed by mount path. + // "active" means the PVC is a cache volume with no resize in progress. + // "pending-deletion" means a resize is in flight (old PVC waiting for pod to stop). + // Absent entry means the mount path is not a cache volume (or the PVC was removed). + // Written at PVC creation and updated through the resize lifecycle. + TieredCacheVolumes map[string]TieredCacheVolumeState `json:"tieredCacheVolumes,omitempty"` } const ( diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index 2829a1d40..d0301a131 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -209,9 +209,9 @@ func (in *BrokerState) DeepCopyInto(out *BrokerState) { *out = make(ExternalListenerConfigNames, len(*in)) copy(*out, *in) } - if in.CacheVolumeStates != nil { - in, out := &in.CacheVolumeStates, &out.CacheVolumeStates - *out = make(map[string]CacheResizeState, len(*in)) + if in.TieredCacheVolumes != nil { + in, out := &in.TieredCacheVolumes, &out.TieredCacheVolumes + *out = make(map[string]TieredCacheVolumeState, len(*in)) for key, val := range *in { (*out)[key] = val } diff --git a/charts/kafka-operator/crds/kafkaclusters.yaml b/charts/kafka-operator/crds/kafkaclusters.yaml index f1859b527..5ad400378 100644 --- a/charts/kafka-operator/crds/kafkaclusters.yaml +++ b/charts/kafka-operator/crds/kafkaclusters.yaml @@ -23092,15 +23092,6 @@ spec: additionalProperties: description: BrokerState holds information about broker state properties: - cacheVolumeStates: - additionalProperties: - description: CacheResizeState tracks the resize lifecycle - of a tiered storage cache PVC for a given mount path. - type: string - description: |- - CacheVolumeStates tracks in-flight tiered storage cache PVC resize operations, keyed by mount path. - An entry is present only while a resize is in progress; it is cleared once cleanup completes. - type: object configurationBackup: description: Compressed data from broker configuration to restore broker pod in specific cases @@ -23181,6 +23172,18 @@ spec: description: RackAwarenessState holds info about rack awareness status type: string + tieredCacheVolumes: + additionalProperties: + description: TieredCacheVolumeState tracks the lifecycle state + of a tiered storage cache PVC for a given mount path. + type: string + description: |- + TieredCacheVolumes tracks tiered storage cache PVC state for this broker, keyed by mount path. + "active" means the PVC is a cache volume with no resize in progress. + "pending-deletion" means a resize is in flight (old PVC waiting for pod to stop). + Absent entry means the mount path is not a cache volume (or the PVC was removed). + Written at PVC creation and updated through the resize lifecycle. + type: object version: description: Version holds the current version of the broker in semver format diff --git a/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml b/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml index f1859b527..5ad400378 100644 --- a/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml +++ b/config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml @@ -23092,15 +23092,6 @@ spec: additionalProperties: description: BrokerState holds information about broker state properties: - cacheVolumeStates: - additionalProperties: - description: CacheResizeState tracks the resize lifecycle - of a tiered storage cache PVC for a given mount path. - type: string - description: |- - CacheVolumeStates tracks in-flight tiered storage cache PVC resize operations, keyed by mount path. - An entry is present only while a resize is in progress; it is cleared once cleanup completes. - type: object configurationBackup: description: Compressed data from broker configuration to restore broker pod in specific cases @@ -23181,6 +23172,18 @@ spec: description: RackAwarenessState holds info about rack awareness status type: string + tieredCacheVolumes: + additionalProperties: + description: TieredCacheVolumeState tracks the lifecycle state + of a tiered storage cache PVC for a given mount path. + type: string + description: |- + TieredCacheVolumes tracks tiered storage cache PVC state for this broker, keyed by mount path. + "active" means the PVC is a cache volume with no resize in progress. + "pending-deletion" means a resize is in flight (old PVC waiting for pod to stop). + Absent entry means the mount path is not a cache volume (or the PVC was removed). + Written at PVC creation and updated through the resize lifecycle. + type: object version: description: Version holds the current version of the broker in semver format diff --git a/docs/tiered-storage-pvc-resize.md b/docs/tiered-storage-pvc-resize.md index dd21d7a1d..59d5e50b2 100644 --- a/docs/tiered-storage-pvc-resize.md +++ b/docs/tiered-storage-pvc-resize.md @@ -10,16 +10,19 @@ with the rolling upgrade machinery so only one broker is affected at a time. ## State tracking Resize state is stored in the `KafkaCluster` CR status under -`status.brokersState[].cacheVolumeStates`, keyed by mount path. +`status.brokersState[].tieredCacheVolumes`, keyed by mount path. This keeps the KafkaCluster CR the single source of truth for all in-flight broker operations and avoids a second, parallel state store on PVC objects. | Field | Value | Meaning | |-------|-------|---------| -| `status.brokersState[N].cacheVolumeStates[]` | `pending-deletion` | A resize is in flight for this mount path. The old PVC (larger size) is waiting to be deleted once the broker pod stops; the replacement PVC (desired smaller size) has already been created. | +| `status.brokersState[N].tieredCacheVolumes[]` | `active` | The mount path is a tiered storage cache volume. No resize is in progress. | +| `status.brokersState[N].tieredCacheVolumes[]` | `pending-deletion` | A resize is in flight for this mount path. The old PVC (larger size) is waiting to be deleted once the broker pod stops; the replacement PVC (desired smaller size) has already been created. | +| *(absent)* | — | The mount path is not a tiered storage cache volume (or the PVC has been removed). | -The entry is cleared once the old PVC has been deleted and the broker pod has -restarted. An empty map means no resize is in progress. +The entry transitions from `pending-deletion` back to `active` once the old PVC +has been deleted and the resize is complete. An absent entry means no cache PVC +exists at that path. Two PVC annotations that describe what a PVC **is** (not operational state) are always present on cache PVCs: @@ -35,7 +38,7 @@ always present on cache PVCs: ### Cycle N — resize detected, pod running -1. `status.brokersState[N].cacheVolumeStates[]` is set to `pending-deletion` +1. `status.brokersState[N].tieredCacheVolumes[]` is set to `pending-deletion` in the KafkaCluster CR status. This is the durable record that a resize is in flight. 2. A replacement PVC with the new (smaller) size is created. Provisioning starts immediately. 3. The broker's `ConfigurationState` is set to `ConfigOutOfSync` to trigger a rolling restart @@ -53,14 +56,14 @@ to fully disappear from etcd. 1. The old PVC (the one whose size differs from the desired size at that mount path) is deleted. -2. The `cacheVolumeStates` entry for that mount path is cleared from the CR status. +2. The `tieredCacheVolumes` entry for that mount path is set to `active` in the CR status. 3. A new broker pod is created referencing the replacement PVC. Because provisioning started in cycle N the PVC is likely already `Bound`, minimising startup latency. ### Cycle N+2 — pod is present again -1. No `cacheVolumeStates` entry remains for the mount path → resize is complete. -2. The replacement PVC is now an ordinary cache PVC with no special state attached. +1. No `pending-deletion` entry remains for the mount path → resize is complete. +2. The replacement PVC is now an ordinary cache PVC with `tieredCacheVolumes[] = active`. --- @@ -69,7 +72,7 @@ to fully disappear from etcd. A cache PVC **grow** takes the normal Kubernetes in-place expansion path: the PVC spec is updated with the larger size and Kubernetes expands the volume without a pod restart (requires `allowVolumeExpansion: true` on the StorageClass). No -`cacheVolumeStates` entry is written and no rolling restart is triggered. +`tieredCacheVolumes` state change is made and no rolling restart is triggered. A cache PVC **shrink** uses the delete-and-recreate flow described above. Shrinking is only supported for tiered storage cache volumes — regular Kafka log @@ -81,11 +84,11 @@ volumes reject any size decrease with an error. | Property | Value | |----------|-------| -| State survives reconciler crash | Yes — `cacheVolumeStates` is written to the KafkaCluster CR (etcd) before the replacement PVC is created; every step is re-entrant | +| State survives reconciler crash | Yes — `tieredCacheVolumes` is written to the KafkaCluster CR (etcd) before the replacement PVC is created; every step is re-entrant | | Single source of truth | Yes — all broker state (configuration, graceful actions, cache resize) lives in `status.brokersState` | | Atomicity gap | Eliminated — replacement PVC is created before old is deleted | | Provisioning overlaps gate evaluation | Yes — replacement PVC created in cycle N, not N+1 | -| Observable via kubectl | Yes — `kubectl get kafkacluster -o jsonpath='{.status.brokersState}'` shows resize state; an empty `cacheVolumeStates` means no resize is in progress | +| Observable via kubectl | Yes — `kubectl get kafkacluster -o jsonpath='{.status.brokersState}'` shows resize state; `pending-deletion` entries indicate an in-flight resize | | CC disk rebalance for cache PVCs | Excluded — tiered cache PVCs are explicitly skipped from `GracefulDiskRebalanceRequired` and CC capacity config | | `log.dirs` for cache PVCs | Excluded — `generateStorageConfig` skips volumes with `TieredStorageCache: true` | @@ -95,7 +98,7 @@ volumes reject any size decrease with an error. ``` Cycle N (pod UP, resize detected) - ├─ set cacheVolumeStates[mountPath] = pending-deletion in CR status + ├─ set tieredCacheVolumes[mountPath] = pending-deletion in CR status ├─ create replacement PVC (provisioning starts) ├─ set ConfigOutOfSync └─ handleRollingUpgrade @@ -107,9 +110,9 @@ Cycle N+k (pod UP, gates failing — any number of cycles) Cycle N+k+1 (pod ABSENT — gone or Terminating) ├─ delete old PVC (identified as the PVC at mountPath whose size ≠ desired) - ├─ clear cacheVolumeStates[mountPath] from CR status + ├─ set tieredCacheVolumes[mountPath] = active in CR status └─ create new pod bound to replacement PVC Cycle N+k+2 (pod PRESENT — non-Terminating, not necessarily Running) - └─ cacheVolumeStates entry is absent → resize complete, no further action + └─ tieredCacheVolumes[mountPath] = active → resize complete, no further action ``` diff --git a/pkg/k8sutil/status.go b/pkg/k8sutil/status.go index e48a305ae..f287727ed 100644 --- a/pkg/k8sutil/status.go +++ b/pkg/k8sutil/status.go @@ -174,15 +174,15 @@ func generateBrokerState(brokerIDs []string, cluster *banzaicloudv1beta1.KafkaCl case banzaicloudv1beta1.KafkaVersion: brokerState.Image = s.Image brokerState.Version = s.Version - case map[string]banzaicloudv1beta1.CacheResizeState: - if brokerState.CacheVolumeStates == nil { - brokerState.CacheVolumeStates = make(map[string]banzaicloudv1beta1.CacheResizeState) + case map[string]banzaicloudv1beta1.TieredCacheVolumeState: + if brokerState.TieredCacheVolumes == nil { + brokerState.TieredCacheVolumes = make(map[string]banzaicloudv1beta1.TieredCacheVolumeState) } for mountPath, state := range s { if state == "" { - delete(brokerState.CacheVolumeStates, mountPath) + delete(brokerState.TieredCacheVolumes, mountPath) } else { - brokerState.CacheVolumeStates[mountPath] = state + brokerState.TieredCacheVolumes[mountPath] = state } } } diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index a02288e63..663852ef9 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -474,8 +474,8 @@ func (r *Reconciler) Reconcile(log logr.Logger) error { pendingDeletionMountPaths := make(map[string]bool) brokerIdStr := strconv.Itoa(int(broker.Id)) - for mp, state := range r.KafkaCluster.Status.BrokersState[brokerIdStr].CacheVolumeStates { - if state == banzaiv1beta1.CacheResizePendingDeletion { + for mp, state := range r.KafkaCluster.Status.BrokersState[brokerIdStr].TieredCacheVolumes { + if state == banzaiv1beta1.TieredCacheVolumePendingDeletion { pendingDeletionMountPaths[mp] = true } } @@ -1288,8 +1288,20 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro return nil } -// deleteRemovedCachePVCs deletes tiered storage cache PVCs whose mount path is no longer desired. -// These volumes are not Kafka log dirs and must never go through CC disk removal. +// isBrokerPVCTieredCache reports whether the given PVC is a tiered storage cache volume, +// using status.BrokersState as the authoritative source of truth (written at PVC creation). +// Falls back to the PVC annotation for PVCs created before TieredCacheVolumes was introduced. +func (r *Reconciler) isBrokerPVCTieredCache(pvc *corev1.PersistentVolumeClaim, brokerId string) bool { + mountPath := pvc.Annotations["mountPath"] + if state := r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes[mountPath]; state != "" { + return true + } + return pvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == annotationTrue +} + +// deleteRemovedCachePVCs deletes tiered storage cache PVCs whose mount path is no longer desired +// and clears their status entries. These volumes are not Kafka log dirs and must never go through +// CC disk removal. func (r *Reconciler) deleteRemovedCachePVCs( ctx context.Context, log logr.Logger, @@ -1297,19 +1309,54 @@ func (r *Reconciler) deleteRemovedCachePVCs( pvcList *corev1.PersistentVolumeClaimList, desiredMountPaths map[string]bool, ) error { + // pathsToUntrack accumulates mount paths whose status entry must be cleared. + // Collected in one pass so a single UpdateBrokerStatus call covers all of them. + pathsToUntrack := make(map[string]banzaiv1beta1.TieredCacheVolumeState) + for i := range pvcList.Items { pvc := &pvcList.Items[i] - if pvc.DeletionTimestamp != nil || pvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] != annotationTrue { + if !r.isBrokerPVCTieredCache(pvc, brokerId) { continue } - if !desiredMountPaths[pvc.Annotations["mountPath"]] { + mountPath := pvc.Annotations["mountPath"] + if desiredMountPaths[mountPath] { + continue + } + // Issue Delete only if the PVC is not already terminating. + if pvc.DeletionTimestamp == nil { log.Info("Deleting removed tiered storage cache PVC", - "brokerId", brokerId, "mountPath", pvc.Annotations["mountPath"], "pvc", pvc.Name) + "brokerId", brokerId, "mountPath", mountPath, "pvc", pvc.Name) if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { return errorfactory.New(errorfactory.APIFailure{}, err, "deleting removed tiered storage cache PVC failed", "pvc", pvc.Name) } } + // Clear status regardless of PVC termination state: once deletion has been + // requested (DeletionTimestamp set), the status entry is no longer needed. + pathsToUntrack[mountPath] = "" + } + + // Second pass: clear status entries for mount paths that are no longer desired + // but whose PVC is already fully finalized and absent from the list. This handles + // the retry path where a prior cycle deleted the PVC but failed to clear status. + for mountPath, state := range r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes { + if state == "" || desiredMountPaths[mountPath] { + continue + } + if _, alreadyHandled := pathsToUntrack[mountPath]; !alreadyHandled { + log.Info("Clearing stale tiered cache volume status for already-deleted mount path", + "brokerId", brokerId, "mountPath", mountPath) + pathsToUntrack[mountPath] = "" + } + } + + if len(pathsToUntrack) == 0 { + return nil + } + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + pathsToUntrack, log); err != nil { + return errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not clear tiered cache volume state after PVC deletion", "brokerId", brokerId) } return nil } @@ -1317,12 +1364,12 @@ func (r *Reconciler) deleteRemovedCachePVCs( // effectivePvcCount returns the number of PVCs for a broker, counting each pending-deletion cache // mount path only once (old + replacement coexist during a shrink but represent one logical disk). func (r *Reconciler) effectivePvcCount(brokerId string, pvcList *corev1.PersistentVolumeClaimList) int { - cacheVolumeStates := r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates + tieredCacheVolumes := r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes counted := make(map[string]bool) total := 0 for _, pvc := range pvcList.Items { mp := pvc.Annotations["mountPath"] - if cacheVolumeStates[mp] == banzaiv1beta1.CacheResizePendingDeletion { + if tieredCacheVolumes[mp] == banzaiv1beta1.TieredCacheVolumePendingDeletion { if !counted[mp] { counted[mp] = true total++ @@ -1358,7 +1405,7 @@ func (r *Reconciler) getBrokerPodExists(ctx context.Context, brokerId string) (b // It returns true when the caller should skip the rest of the reconcile loop for this broker (orphaned resize // state detected while the pod is running — a rolling restart must complete first). // -//nolint:funlen +//nolint:funlen,gocyclo func (r *Reconciler) handleBrokerCacheResizeCleanup( ctx context.Context, log logr.Logger, @@ -1369,8 +1416,15 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( brokerPodExists bool, matchingLabels client.MatchingLabels, ) (bool, error) { - cacheVolumeStates := r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates - if len(cacheVolumeStates) == 0 { + tieredCacheVolumes := r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes + hasPendingDeletion := false + for _, state := range tieredCacheVolumes { + if state == banzaiv1beta1.TieredCacheVolumePendingDeletion { + hasPendingDeletion = true + break + } + } + if !hasPendingDeletion { return false, nil } @@ -1384,9 +1438,9 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( } } - // stateUpdates accumulates CacheVolumeStates changes to apply in a single status update. - // An empty CacheResizeState value means "delete this entry from the map". - stateUpdates := make(map[string]banzaiv1beta1.CacheResizeState) + // stateUpdates accumulates TieredCacheVolumes changes to apply in a single status update. + // An empty TieredCacheVolumeState value means "delete this entry from the map". + stateUpdates := make(map[string]banzaiv1beta1.TieredCacheVolumeState) if brokerPodExists { // Resize complete when the replacement PVC (size == desired) already exists at the mount @@ -1395,8 +1449,8 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( // pending-deletion state and creating the replacement PVC, the next cycle sees only the // old PVC (size != desired) and correctly does NOT clear state, letting // reconcileDesiredPvcsForBroker re-create the replacement. - for mp, state := range cacheVolumeStates { - if state != banzaiv1beta1.CacheResizePendingDeletion { + for mp, state := range tieredCacheVolumes { + if state != banzaiv1beta1.TieredCacheVolumePendingDeletion { continue } desiredSize := desiredSizeByMountPath[mp] @@ -1415,15 +1469,15 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( } } if replacementExists && !oldPvcExists { - stateUpdates[mp] = "" + stateUpdates[mp] = banzaiv1beta1.TieredCacheVolumeActive log.Info("Tiered storage cache PVC resize complete — clearing resize state", "brokerId", brokerId, "mountPath", mp) } } } else { // Broker pod is down — safe to delete old PVCs and clean up orphaned state. - for mp, state := range cacheVolumeStates { - if state != banzaiv1beta1.CacheResizePendingDeletion { + for mp, state := range tieredCacheVolumes { + if state != banzaiv1beta1.TieredCacheVolumePendingDeletion { continue } if !desiredMountPaths[mp] { @@ -1440,6 +1494,7 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( "deleting orphaned cache resize PVC failed", "pvc", pvc.Name) } } + stateUpdates[mp] = "" // entry removed: volume gone, no longer a cache PVC } else { // Mount path still desired: delete the old PVC (the one whose size differs from desired). desiredSize := desiredSizeByMountPath[mp] @@ -1458,8 +1513,8 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( break } } + stateUpdates[mp] = banzaiv1beta1.TieredCacheVolumeActive // resize done, still a cache PVC } - stateUpdates[mp] = "" } if len(stateUpdates) > 0 { @@ -1487,7 +1542,10 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( // longer desired (storage config removed mid-resize), trigger a rolling restart so the pod // stops and the orphaned PVCs can be deleted on the next cycle. if brokerPodExists { - for mp := range r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates { + for mp, state := range r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes { + if state != banzaiv1beta1.TieredCacheVolumePendingDeletion { + continue + } if !desiredMountPaths[mp] { log.Info("Orphaned cache resize state detected with broker pod running — triggering rolling restart", "brokerId", brokerId, "mountPath", mp) @@ -1508,7 +1566,7 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( // reconcileDesiredPvcsForBroker reconciles the desired PVCs for a single broker. // -//nolint:funlen +//nolint:funlen,gocyclo func (r *Reconciler) reconcileDesiredPvcsForBroker( ctx context.Context, log logr.Logger, @@ -1537,6 +1595,13 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( return errorfactory.New(errorfactory.APIFailure{}, err, "creating resource failed", "kind", desiredType) } log.Info("resource created") + if desiredPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == annotationTrue { + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + map[string]banzaiv1beta1.TieredCacheVolumeState{mountPath: banzaiv1beta1.TieredCacheVolumeActive}, log); err != nil { + return errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not record tiered cache volume state after PVC creation", "brokerId", brokerId) + } + } continue } @@ -1554,7 +1619,7 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( // Skip the old PVC that is waiting to be deleted once the broker pod stops. // We identify it as the PVC whose size differs from the desired size while a // pending-deletion resize is in flight for this mount path. - if r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates[mountPath] == banzaiv1beta1.CacheResizePendingDeletion { + if r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes[mountPath] == banzaiv1beta1.TieredCacheVolumePendingDeletion { if desiredPvc.Spec.Resources.Requests.Storage().Value() != pvc.Spec.Resources.Requests.Storage().Value() { continue } @@ -1564,7 +1629,7 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( // Trigger a CC disk rebalance only for regular data volumes. // Tiered storage cache PVCs are ephemeral — CC must not account for them. - if currentPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] != annotationTrue { + if !r.isBrokerPVCTieredCache(currentPvc, brokerId) { // Checking pvc state, if bounded, so the broker has already restarted and the CC GracefulDiskRebalance has not happened yet, // then we make it happening with status update. // If disk removal was set, and the disk was added back, we also need to mark the volume for rebalance @@ -1584,7 +1649,7 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( // excludes the old PVC, so alreadyCreated is false even when a replacement exists. // A strongly-consistent (uncached) read picks up that replacement so we don't issue a // second Create with a fresh GenerateName. - if r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates[mountPath] == banzaiv1beta1.CacheResizePendingDeletion { + if r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes[mountPath] == banzaiv1beta1.TieredCacheVolumePendingDeletion { liveList := &corev1.PersistentVolumeClaimList{} if err := r.DirectClient.List(ctx, liveList, client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels); err != nil { return errorfactory.New(errorfactory.APIFailure{}, err, "uncached list of PVCs failed", "kind", desiredType) @@ -1613,13 +1678,20 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( if err := r.Create(ctx, desiredPvc); err != nil { return errorfactory.New(errorfactory.APIFailure{}, err, "creating resource failed", "kind", desiredType) } + if desiredPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == annotationTrue { + if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, + map[string]banzaiv1beta1.TieredCacheVolumeState{mountPath: banzaiv1beta1.TieredCacheVolumeActive}, log); err != nil { + return errorfactory.New(errorfactory.StatusUpdateError{}, err, + "could not record tiered cache volume state after PVC creation", "brokerId", brokerId) + } + } continue } // A resize is in flight for this mount path: the matched PVC is the replacement. // Keep triggering ConfigOutOfSync so the rolling upgrade completes and the old PVC // can be deleted on the next reconcile cycle. - if r.KafkaCluster.Status.BrokersState[brokerId].CacheVolumeStates[mountPath] == banzaiv1beta1.CacheResizePendingDeletion { + if r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes[mountPath] == banzaiv1beta1.TieredCacheVolumePendingDeletion { if r.KafkaCluster.Status.BrokersState[brokerId].ConfigurationState != banzaiv1beta1.ConfigOutOfSync { if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, banzaiv1beta1.ConfigOutOfSync, log); err != nil { @@ -1630,11 +1702,11 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( continue } - // Trust only the persisted PVC annotation when classifying — never the desired-spec - // annotation. Reading from the desired side would let a CR-edit (flipping - // tieredStorageCache: false → true on an existing data volume) route a real log-dir PVC - // through the cache-shrink delete-and-recreate path, bypassing graceful disk drain. - isTieredCache := currentPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == annotationTrue + // Use the committed brokerState (written at PVC creation) as the authoritative source of + // truth — never the desired spec. Reading from the desired side would let a CR edit + // (flipping tieredStorageCache: false → true on an existing data volume) route a real + // log-dir PVC through the cache-shrink delete-and-recreate path, bypassing graceful drain. + isTieredCache := r.isBrokerPVCTieredCache(currentPvc, brokerId) desiredSize := desiredPvc.Spec.Resources.Requests.Storage().Value() currentSize := currentPvc.Spec.Resources.Requests.Storage().Value() @@ -1697,7 +1769,7 @@ func (r *Reconciler) stageTieredCachePVCShrink( // Record the resize in brokerState so the reconciler knows to exclude the old PVC and // keep the rolling upgrade running until the old PVC is cleaned up. if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, - map[string]banzaiv1beta1.CacheResizeState{mountPath: banzaiv1beta1.CacheResizePendingDeletion}, log); err != nil { + map[string]banzaiv1beta1.TieredCacheVolumeState{mountPath: banzaiv1beta1.TieredCacheVolumePendingDeletion}, log); err != nil { return errorfactory.New(errorfactory.StatusUpdateError{}, err, "could not record cache resize state for tiered storage cache PVC resize", "brokerId", brokerId) } @@ -1709,6 +1781,9 @@ func (r *Reconciler) stageTieredCachePVCShrink( return errors.WrapIf(err, "could not apply last state to annotation") } if err := r.Create(ctx, desiredPvc); err != nil { + // Resize state is already recorded; the next reconcile cycle will re-create the replacement. + log.Error(err, "creating replacement tiered storage cache PVC failed after recording resize state; will retry", + "brokerId", brokerId, "mountPath", mountPath) return errorfactory.New(errorfactory.APIFailure{}, err, "creating replacement tiered storage cache PVC failed", "brokerId", brokerId) } diff --git a/pkg/resources/kafka/kafka_test.go b/pkg/resources/kafka/kafka_test.go index ae0bd31bc..176b80e59 100644 --- a/pkg/resources/kafka/kafka_test.go +++ b/pkg/resources/kafka/kafka_test.go @@ -1412,14 +1412,14 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { directClientOnlyPvc *corev1.PersistentVolumeClaim desiredPvc *corev1.PersistentVolumeClaim existingPods []corev1.Pod - initialCacheVolumeState v1beta1.CacheResizeState // pre-existing brokerState for mountPath, if any + initialTieredCacheState v1beta1.TieredCacheVolumeState // pre-existing brokerState for mountPath, if any expectedUpdatePvc bool expectedCreatePvc bool expectedDeletePvc bool expectedError bool }{ { - // Pod is up, no prior resize state: record CacheResizePendingDeletion in brokerState, + // Pod is up, no prior resize state: record TieredCacheVolumePendingDeletion in brokerState, // create replacement PVC, set ConfigOutOfSync to trigger rolling upgrade. testName: "size decrease with running pod — resize state recorded, replacement PVC created", existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), @@ -1449,7 +1449,7 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), existingPods: []corev1.Pod{terminatingPod}, - initialCacheVolumeState: v1beta1.CacheResizePendingDeletion, + initialTieredCacheState: v1beta1.TieredCacheVolumePendingDeletion, expectedUpdatePvc: false, expectedCreatePvc: true, expectedDeletePvc: true, @@ -1474,7 +1474,7 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), existingPods: []corev1.Pod{}, - initialCacheVolumeState: v1beta1.CacheResizePendingDeletion, + initialTieredCacheState: v1beta1.TieredCacheVolumePendingDeletion, expectedUpdatePvc: false, expectedCreatePvc: true, expectedDeletePvc: true, @@ -1500,7 +1500,7 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), existingPods: []corev1.Pod{runningPod}, - initialCacheVolumeState: v1beta1.CacheResizePendingDeletion, + initialTieredCacheState: v1beta1.TieredCacheVolumePendingDeletion, expectedUpdatePvc: false, expectedCreatePvc: true, expectedDeletePvc: false, @@ -1515,7 +1515,7 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { additionalExistingPvc: makeTieredCachePvc("cache-pvc-replacement", "50Gi"), desiredPvc: makeTieredCachePvc("cache-pvc-replacement", "50Gi"), existingPods: []corev1.Pod{runningPod}, - initialCacheVolumeState: v1beta1.CacheResizePendingDeletion, + initialTieredCacheState: v1beta1.TieredCacheVolumePendingDeletion, expectedUpdatePvc: false, expectedCreatePvc: false, expectedDeletePvc: false, @@ -1533,7 +1533,7 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { directClientOnlyPvc: makeTieredCachePvc("cache-pvc-replacement", "50Gi"), desiredPvc: makeTieredCachePvc("cache-pvc-replacement", "50Gi"), existingPods: []corev1.Pod{runningPod}, - initialCacheVolumeState: v1beta1.CacheResizePendingDeletion, + initialTieredCacheState: v1beta1.TieredCacheVolumePendingDeletion, expectedUpdatePvc: false, expectedCreatePvc: false, expectedDeletePvc: false, @@ -1559,11 +1559,11 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { }}, }, } - if test.initialCacheVolumeState != "" { + if test.initialTieredCacheState != "" { kafkaCluster.Status.BrokersState = map[string]v1beta1.BrokerState{ brokerId: { - CacheVolumeStates: map[string]v1beta1.CacheResizeState{ - mountPath: test.initialCacheVolumeState, + TieredCacheVolumes: map[string]v1beta1.TieredCacheVolumeState{ + mountPath: test.initialTieredCacheState, }, }, } diff --git a/pkg/webhooks/kafkacluster_validator.go b/pkg/webhooks/kafkacluster_validator.go index 251e5a0b0..5c7b9fbde 100644 --- a/pkg/webhooks/kafkacluster_validator.go +++ b/pkg/webhooks/kafkacluster_validator.go @@ -18,6 +18,7 @@ package webhooks import ( "context" "fmt" + "strconv" corev1 "k8s.io/api/core/v1" @@ -48,7 +49,7 @@ func (s KafkaClusterValidator) ValidateUpdate(ctx context.Context, oldObj, newOb allErrs = append(allErrs, listenerErrs...) } - allErrs = append(allErrs, checkTieredStorageCacheImmutability(&kafkaClusterOld.Spec, &kafkaClusterNew.Spec)...) + allErrs = append(allErrs, checkTieredStorageCacheImmutability(kafkaClusterOld, kafkaClusterNew)...) if len(allErrs) == 0 { return nil, nil @@ -83,59 +84,66 @@ func (s KafkaClusterValidator) ValidateDelete(_ context.Context, _ *banzaicloudv return nil, nil } -// checkTieredStorageCacheImmutability rejects updates that flip the TieredStorageCache flag on an -// existing storageConfigs entry (matched by mountPath, scoped per brokerConfigGroup or per broker). -// Flipping false→true on a live data PVC silently removes it from log.dirs and Cruise Control -// capacity, and enables a delete-and-recreate shrink path that bypasses graceful disk drain — an -// irreversible data-loss vector when applied to a volume that already holds Kafka log segments. -// To change this property, callers must remove the entry and re-add it in a subsequent apply. -func checkTieredStorageCacheImmutability(oldSpec, newSpec *banzaicloudv1beta1.KafkaClusterSpec) field.ErrorList { +// checkTieredStorageCacheImmutability rejects updates that change the tieredStorageCache +// classification of an existing PVC. It uses status.BrokersState[brokerID].TieredCacheVolumes +// as the authoritative source of truth — the reconciler writes this map when it creates each PVC. +// Comparing against committed status catches all bypass paths (in-place flip, group-switch, +// inline-shadow override) without requiring spec-level merging or effective-config resolution. +// Removing a mountPath from the spec is allowed (delete-and-re-add path). +func checkTieredStorageCacheImmutability(oldCluster, newCluster *banzaicloudv1beta1.KafkaCluster) field.ErrorList { var allErrs field.ErrorList - for groupName, newGroup := range newSpec.BrokerConfigGroups { - oldGroup, ok := oldSpec.BrokerConfigGroups[groupName] - if !ok { - continue - } - groupPath := field.NewPath("spec").Child("brokerConfigGroups").Key(groupName).Child("storageConfigs") - allErrs = append(allErrs, diffTieredStorageCache(oldGroup.StorageConfigs, newGroup.StorageConfigs, groupPath)...) - } - - oldByID := make(map[int32]banzaicloudv1beta1.Broker, len(oldSpec.Brokers)) - for _, b := range oldSpec.Brokers { - oldByID[b.Id] = b - } - for i, newBroker := range newSpec.Brokers { - oldBroker, ok := oldByID[newBroker.Id] - if !ok || oldBroker.BrokerConfig == nil || newBroker.BrokerConfig == nil { - continue + for brokerIDStr, brokerState := range oldCluster.Status.BrokersState { + for mountPath, state := range brokerState.TieredCacheVolumes { + if state == "" { + continue // not a cache PVC + } + newValue, fieldPath, found := findTieredStorageCacheInSpec(&newCluster.Spec, brokerIDStr, mountPath) + if !found { + continue // mountPath removed from spec — remove-and-re-add is allowed + } + if !newValue { + allErrs = append(allErrs, field.Forbidden(fieldPath, immutableTieredStorageCacheErrMsg)) + } } - brokerPath := field.NewPath("spec").Child("brokers").Index(i).Child("brokerConfig").Child("storageConfigs") - allErrs = append(allErrs, diffTieredStorageCache(oldBroker.BrokerConfig.StorageConfigs, newBroker.BrokerConfig.StorageConfigs, brokerPath)...) } return allErrs } -func diffTieredStorageCache(oldConfigs, newConfigs []banzaicloudv1beta1.StorageConfig, basePath *field.Path) field.ErrorList { - var allErrs field.ErrorList - oldByPath := make(map[string]banzaicloudv1beta1.StorageConfig, len(oldConfigs)) - for _, sc := range oldConfigs { - oldByPath[sc.MountPath] = sc - } - for i, newSC := range newConfigs { - oldSC, ok := oldByPath[newSC.MountPath] - if !ok { +// findTieredStorageCacheInSpec resolves the intended tieredStorageCache value for a given +// broker (by string ID, matching reconciler convention) and mountPath from the raw spec. +// Inline storageConfigs take priority over brokerConfigGroup (matching dedupStorageConfigs order). +// Returns (value, fieldPath, found=true) when the mountPath is present in the new spec. +func findTieredStorageCacheInSpec(spec *banzaicloudv1beta1.KafkaClusterSpec, brokerIDStr, mountPath string) (bool, *field.Path, bool) { + for i, broker := range spec.Brokers { + if strconv.Itoa(int(broker.Id)) != brokerIDStr { continue } - if oldSC.TieredStorageCache != newSC.TieredStorageCache { - allErrs = append(allErrs, field.Forbidden( - basePath.Index(i).Child("tieredStorageCache"), - immutableTieredStorageCacheErrMsg, - )) + // Inline config has priority (mirrors dedupStorageConfigs order in GetBrokerConfig) + if broker.BrokerConfig != nil { + for k, sc := range broker.BrokerConfig.StorageConfigs { + if sc.MountPath == mountPath { + p := field.NewPath("spec").Child("brokers").Index(i). + Child("brokerConfig").Child("storageConfigs").Index(k).Child("tieredStorageCache") + return sc.TieredStorageCache, p, true + } + } + } + if broker.BrokerConfigGroup != "" { + if group, ok := spec.BrokerConfigGroups[broker.BrokerConfigGroup]; ok { + for k, sc := range group.StorageConfigs { + if sc.MountPath == mountPath { + p := field.NewPath("spec").Child("brokerConfigGroups"). + Key(broker.BrokerConfigGroup).Child("storageConfigs").Index(k).Child("tieredStorageCache") + return sc.TieredStorageCache, p, true + } + } + } } + break } - return allErrs + return false, nil, false } // checkListeners validates the spec.listenersConfig object diff --git a/pkg/webhooks/kafkacluster_validator_test.go b/pkg/webhooks/kafkacluster_validator_test.go index 6193a9a8b..7d81acf69 100644 --- a/pkg/webhooks/kafkacluster_validator_test.go +++ b/pkg/webhooks/kafkacluster_validator_test.go @@ -259,53 +259,65 @@ func TestCheckExternalListenerStartingPort(t *testing.T) { } func TestCheckTieredStorageCacheImmutability(t *testing.T) { - mp := func(path string, tc bool) v1beta1.StorageConfig { + sc := func(path string, tc bool) v1beta1.StorageConfig { return v1beta1.StorageConfig{MountPath: path, TieredStorageCache: tc} } + // committed returns a KafkaCluster whose status records the given tieredCache volumes for broker 0. + // Only cache PVCs are tracked (TieredCacheVolumeActive or TieredCacheVolumePendingDeletion); + // non-cache PVCs have no entry. + committed := func(vols map[string]v1beta1.TieredCacheVolumeState) *v1beta1.KafkaCluster { + return &v1beta1.KafkaCluster{ + Status: v1beta1.KafkaClusterStatus{ + BrokersState: map[string]v1beta1.BrokerState{ + "0": {TieredCacheVolumes: vols}, + }, + }, + } + } testCases := []struct { - testName string - oldSpec v1beta1.KafkaClusterSpec - newSpec v1beta1.KafkaClusterSpec - expected field.ErrorList + testName string + oldCluster *v1beta1.KafkaCluster + newCluster *v1beta1.KafkaCluster + expected field.ErrorList }{ { - testName: "no change — empty", - oldSpec: v1beta1.KafkaClusterSpec{}, - newSpec: v1beta1.KafkaClusterSpec{}, + // No committed state — new cluster, any spec is valid. + testName: "no committed state — any spec change allowed", + oldCluster: &v1beta1.KafkaCluster{}, + newCluster: &v1beta1.KafkaCluster{Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ + StorageConfigs: []v1beta1.StorageConfig{sc("/data", true)}, + }}}, + }}, expected: nil, }, { - testName: "brokerConfigGroups: flip false→true on existing mountPath rejected", - oldSpec: v1beta1.KafkaClusterSpec{ - BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ - "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/kafka-logs/0", false)}}, - }, - }, - newSpec: v1beta1.KafkaClusterSpec{ - BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ - "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/kafka-logs/0", true)}}, - }, - }, + // Committed status says /data IS a cache volume; inline spec now marks it as non-cache. + testName: "in-place inline flip true→false rejected", + oldCluster: committed(map[string]v1beta1.TieredCacheVolumeState{"/data": v1beta1.TieredCacheVolumeActive}), + newCluster: &v1beta1.KafkaCluster{Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ + StorageConfigs: []v1beta1.StorageConfig{sc("/data", false)}, + }}}, + }}, expected: append(field.ErrorList{}, field.Forbidden( - field.NewPath("spec").Child("brokerConfigGroups").Key("default").Child("storageConfigs").Index(0).Child("tieredStorageCache"), + field.NewPath("spec").Child("brokers").Index(0).Child("brokerConfig").Child("storageConfigs").Index(0).Child("tieredStorageCache"), immutableTieredStorageCacheErrMsg, ), ), }, { - testName: "brokerConfigGroups: flip true→false on existing mountPath rejected", - oldSpec: v1beta1.KafkaClusterSpec{ - BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ - "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/cache/0", true)}}, - }, - }, - newSpec: v1beta1.KafkaClusterSpec{ + // Committed status says /cache IS a cache volume; group spec now marks it as non-cache. + testName: "in-place group flip true→false rejected", + oldCluster: committed(map[string]v1beta1.TieredCacheVolumeState{"/cache": v1beta1.TieredCacheVolumeActive}), + newCluster: &v1beta1.KafkaCluster{Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfigGroup: "default"}}, BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ - "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/cache/0", false)}}, + "default": {StorageConfigs: []v1beta1.StorageConfig{sc("/cache", false)}}, }, - }, + }}, expected: append(field.ErrorList{}, field.Forbidden( field.NewPath("spec").Child("brokerConfigGroups").Key("default").Child("storageConfigs").Index(0).Child("tieredStorageCache"), @@ -314,48 +326,43 @@ func TestCheckTieredStorageCacheImmutability(t *testing.T) { ), }, { - testName: "brokerConfigGroups: new entry with tieredStorageCache=true allowed", - oldSpec: v1beta1.KafkaClusterSpec{ - BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ - "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/kafka-logs/0", false)}}, - }, - }, - newSpec: v1beta1.KafkaClusterSpec{ + // Bypass attempt: broker switches from groupA (where /cache=true) to groupB (where /cache=false). + // The raw-spec old→new comparison would miss this since groupA is unchanged. + // Status-based check catches it because committed status records /cache=active for broker 0. + testName: "group-switch bypass rejected", + oldCluster: committed(map[string]v1beta1.TieredCacheVolumeState{"/cache": v1beta1.TieredCacheVolumeActive}), + newCluster: &v1beta1.KafkaCluster{Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfigGroup: "groupB"}}, BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ - "default": {StorageConfigs: []v1beta1.StorageConfig{ - mp("/kafka-logs/0", false), - mp("/cache/0", true), - }}, + "groupA": {StorageConfigs: []v1beta1.StorageConfig{sc("/cache", true)}}, + "groupB": {StorageConfigs: []v1beta1.StorageConfig{sc("/cache", false)}}, }, - }, - expected: nil, + }}, + expected: append(field.ErrorList{}, + field.Forbidden( + field.NewPath("spec").Child("brokerConfigGroups").Key("groupB").Child("storageConfigs").Index(0).Child("tieredStorageCache"), + immutableTieredStorageCacheErrMsg, + ), + ), }, { - testName: "brokerConfigGroups: remove entry, then re-add with different value — allowed (no entry to match against in old at the same mountPath after removal scenario; test single-pass: removed entry simply not in new)", - oldSpec: v1beta1.KafkaClusterSpec{ - BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ - "default": {StorageConfigs: []v1beta1.StorageConfig{mp("/cache/0", false)}}, - }, - }, - newSpec: v1beta1.KafkaClusterSpec{ + // Bypass attempt: /cache was provisioned as a cache volume (via group); now an inline entry + // overrides it with TieredStorageCache=false. Inline takes priority in GetBrokerConfig so + // the effective value would flip — the check must reject the inline entry. + testName: "inline-shadow bypass rejected", + oldCluster: committed(map[string]v1beta1.TieredCacheVolumeState{"/cache": v1beta1.TieredCacheVolumeActive}), + newCluster: &v1beta1.KafkaCluster{Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{ + Id: 0, + BrokerConfigGroup: "default", + BrokerConfig: &v1beta1.BrokerConfig{ + StorageConfigs: []v1beta1.StorageConfig{sc("/cache", false)}, + }, + }}, BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ - "default": {StorageConfigs: []v1beta1.StorageConfig{}}, + "default": {StorageConfigs: []v1beta1.StorageConfig{sc("/cache", true)}}, }, - }, - expected: nil, - }, - { - testName: "brokers[].brokerConfig: flip false→true rejected", - oldSpec: v1beta1.KafkaClusterSpec{ - Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ - StorageConfigs: []v1beta1.StorageConfig{mp("/kafka-logs/0", false)}, - }}}, - }, - newSpec: v1beta1.KafkaClusterSpec{ - Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ - StorageConfigs: []v1beta1.StorageConfig{mp("/kafka-logs/0", true)}, - }}}, - }, + }}, expected: append(field.ErrorList{}, field.Forbidden( field.NewPath("spec").Child("brokers").Index(0).Child("brokerConfig").Child("storageConfigs").Index(0).Child("tieredStorageCache"), @@ -364,34 +371,43 @@ func TestCheckTieredStorageCacheImmutability(t *testing.T) { ), }, { - testName: "brokers[].brokerConfig: nil old config — no error (entry didn't exist)", - oldSpec: v1beta1.KafkaClusterSpec{ - Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: nil}}, - }, - newSpec: v1beta1.KafkaClusterSpec{ + // mountPath is removed from spec entirely — remove-and-re-add path is intentionally allowed. + testName: "remove mountPath from spec — allowed", + oldCluster: committed(map[string]v1beta1.TieredCacheVolumeState{"/cache": v1beta1.TieredCacheVolumeActive}), + newCluster: &v1beta1.KafkaCluster{Spec: v1beta1.KafkaClusterSpec{ Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ - StorageConfigs: []v1beta1.StorageConfig{mp("/cache/0", true)}, + StorageConfigs: []v1beta1.StorageConfig{sc("/data", false)}, }}}, - }, + }}, expected: nil, }, { - testName: "brokerConfigGroup not present in old — no error (whole group is new)", - oldSpec: v1beta1.KafkaClusterSpec{ - BrokerConfigGroups: map[string]v1beta1.BrokerConfig{}, - }, - newSpec: v1beta1.KafkaClusterSpec{ - BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ - "new-group": {StorageConfigs: []v1beta1.StorageConfig{mp("/cache/0", true)}}, - }, - }, + // New mountPath with no committed state — any value is allowed. + testName: "new mountPath not in committed state — allowed", + oldCluster: committed(map[string]v1beta1.TieredCacheVolumeState{"/data": v1beta1.TieredCacheVolumeActive}), + newCluster: &v1beta1.KafkaCluster{Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ + StorageConfigs: []v1beta1.StorageConfig{sc("/data", true), sc("/cache", true)}, + }}}, + }}, + expected: nil, + }, + { + // Value unchanged — no error. + testName: "unchanged value — no error", + oldCluster: committed(map[string]v1beta1.TieredCacheVolumeState{"/cache": v1beta1.TieredCacheVolumeActive}), + newCluster: &v1beta1.KafkaCluster{Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ + StorageConfigs: []v1beta1.StorageConfig{sc("/cache", true)}, + }}}, + }}, expected: nil, }, } for _, testCase := range testCases { t.Run(testCase.testName, func(t *testing.T) { - got := checkTieredStorageCacheImmutability(&testCase.oldSpec, &testCase.newSpec) + got := checkTieredStorageCacheImmutability(testCase.oldCluster, testCase.newCluster) require.Equal(t, testCase.expected, got) }) } diff --git a/tests/e2e/test_tiered_storage_cache_resize.go b/tests/e2e/test_tiered_storage_cache_resize.go index 5e6e1cfb3..00eaae5f7 100644 --- a/tests/e2e/test_tiered_storage_cache_resize.go +++ b/tests/e2e/test_tiered_storage_cache_resize.go @@ -181,6 +181,33 @@ func testTieredStorageCachePvcResize() bool { var err error var broker0PodUID string // UID of the broker-0 pod before the resize, used to detect recycling + // AfterAll ensures the cluster and its PVCs are removed even when a test step fails + // mid-run, preventing stale resources from blocking the next test run. + ginkgo.AfterAll(func() { + opts, e := kubectlOptionsForCurrentContext() + if e != nil { + return + } + opts.Namespace = koperatorLocalHelmDescriptor.Namespace + _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &opts, + "patch", kafkaKind, tsResizeClusterName, + "--type=merge", `--patch={"metadata":{"finalizers":[]}}`, + ) + _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &opts, + "delete", kafkaKind, tsResizeClusterName, "--ignore-not-found", + ) + _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &opts, + "delete", "persistentvolumeclaims", + "-l", fmt.Sprintf("%s=%s", kafkaCRLabelKey, tsResizeClusterName), + "--ignore-not-found", + ) + _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &opts, + "delete", "pods", + "-l", fmt.Sprintf("%s=%s", kafkaCRLabelKey, tsResizeClusterName), + "--ignore-not-found", "--grace-period=0", + ) + }) + ginkgo.It("Acquiring K8s config and context", func() { kubectlOptions, err = kubectlOptionsForCurrentContext() gomega.Expect(err).NotTo(gomega.HaveOccurred()) From a028b60bf127ead8c3924260424477446b94356a Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Fri, 8 May 2026 14:09:56 +0300 Subject: [PATCH 16/21] agent review- e2e tests --- pkg/resources/kafka/kafka.go | 6 ++- pkg/resources/kafka/kafka_test.go | 27 ++++++++----- tests/e2e/test_tiered_storage_cache_resize.go | 39 ++++++++++--------- 3 files changed, 44 insertions(+), 28 deletions(-) diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index 663852ef9..19a1ebf18 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -1678,7 +1678,11 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( if err := r.Create(ctx, desiredPvc); err != nil { return errorfactory.New(errorfactory.APIFailure{}, err, "creating resource failed", "kind", desiredType) } - if desiredPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == annotationTrue { + // Only write Active when this is a genuinely new PVC. If pending-deletion is already + // set (crash-recovery: prior Create timed out), preserve the in-flight state so + // handleBrokerCacheResizeCleanup can still delete the old PVC on the next cycle. + if desiredPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == annotationTrue && + r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes[mountPath] != banzaiv1beta1.TieredCacheVolumePendingDeletion { if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, map[string]banzaiv1beta1.TieredCacheVolumeState{mountPath: banzaiv1beta1.TieredCacheVolumeActive}, log); err != nil { return errorfactory.New(errorfactory.StatusUpdateError{}, err, diff --git a/pkg/resources/kafka/kafka_test.go b/pkg/resources/kafka/kafka_test.go index 176b80e59..168e43884 100644 --- a/pkg/resources/kafka/kafka_test.go +++ b/pkg/resources/kafka/kafka_test.go @@ -1417,6 +1417,9 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { expectedCreatePvc bool expectedDeletePvc bool expectedError bool + // expectedTieredCacheState, when non-empty, asserts the final TieredCacheVolumes[mountPath] + // value after reconcileKafkaPvc returns. + expectedTieredCacheState v1beta1.TieredCacheVolumeState }{ { // Pod is up, no prior resize state: record TieredCacheVolumePendingDeletion in brokerState, @@ -1496,15 +1499,16 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { // failed before completing. Pod is still running, so the old PVC must NOT be // deleted and state must NOT be cleared. The reconciler must re-create the // replacement. - testName: "pending-deletion with running pod and no replacement — replacement re-staged", - existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), - desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), - existingPods: []corev1.Pod{runningPod}, - initialTieredCacheState: v1beta1.TieredCacheVolumePendingDeletion, - expectedUpdatePvc: false, - expectedCreatePvc: true, - expectedDeletePvc: false, - expectedError: false, + testName: "pending-deletion with running pod and no replacement — replacement re-staged", + existingPvc: makeTieredCachePvc("cache-pvc-1", "100Gi"), + desiredPvc: makeTieredCachePvc("cache-pvc-new", "50Gi"), + existingPods: []corev1.Pod{runningPod}, + initialTieredCacheState: v1beta1.TieredCacheVolumePendingDeletion, + expectedUpdatePvc: false, + expectedCreatePvc: true, + expectedDeletePvc: false, + expectedError: false, + expectedTieredCacheState: v1beta1.TieredCacheVolumePendingDeletion, }, { // Idempotency (symmetric): pending-deletion state and a replacement at desired @@ -1646,6 +1650,11 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { } else { assert.Nil(t, err, "expected no error but got: %v", err) } + if test.expectedTieredCacheState != "" { + finalState := kafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes[mountPath] + assert.Equal(t, test.expectedTieredCacheState, finalState, + "TieredCacheVolumes[%s] after reconcile", mountPath) + } }) } } diff --git a/tests/e2e/test_tiered_storage_cache_resize.go b/tests/e2e/test_tiered_storage_cache_resize.go index 00eaae5f7..273858d63 100644 --- a/tests/e2e/test_tiered_storage_cache_resize.go +++ b/tests/e2e/test_tiered_storage_cache_resize.go @@ -45,13 +45,14 @@ const ( // pvcItem is a minimal representation of a PVC for assertion helpers. type pvcItem struct { - Name string - Annotations map[string]string - StorageSize string - Phase string + Name string + Annotations map[string]string + StorageSize string + Phase string + IsTerminating bool } -// getCacheResizeState returns the CacheVolumeStates entry for the given broker and mount path +// getCacheResizeState returns the tieredCacheVolumes entry for the given broker and mount path // from the KafkaCluster CR status, or an empty string if not set. func getCacheResizeState(kubectlOptions k8s.KubectlOptions, clusterName, brokerID, mountPath string) (string, error) { rawOutput, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, @@ -65,7 +66,7 @@ func getCacheResizeState(kubectlOptions k8s.KubectlOptions, clusterName, brokerI var cr struct { Status struct { BrokersState map[string]struct { - CacheVolumeStates map[string]string `json:"cacheVolumeStates"` + TieredCacheVolumes map[string]string `json:"tieredCacheVolumes"` } `json:"brokersState"` } `json:"status"` } @@ -77,7 +78,7 @@ func getCacheResizeState(kubectlOptions k8s.KubectlOptions, clusterName, brokerI if !ok { return "", nil } - return brokerState.CacheVolumeStates[mountPath], nil + return brokerState.TieredCacheVolumes[mountPath], nil } // listBrokerCachePVCs returns PVCs for broker tsResizeBrokerID that have the @@ -97,8 +98,9 @@ func listBrokerCachePVCs(kubectlOptions k8s.KubectlOptions) ([]pvcItem, error) { var pvcList struct { Items []struct { Metadata struct { - Name string `json:"name"` - Annotations map[string]string `json:"annotations"` + Name string `json:"name"` + Annotations map[string]string `json:"annotations"` + DeletionTimestamp *string `json:"deletionTimestamp"` } `json:"metadata"` Spec struct { Resources struct { @@ -120,10 +122,11 @@ func listBrokerCachePVCs(kubectlOptions k8s.KubectlOptions) ([]pvcItem, error) { for _, item := range pvcList.Items { if item.Metadata.Annotations["mountPath"] == tsResizeCacheMountPath { result = append(result, pvcItem{ - Name: item.Metadata.Name, - Annotations: item.Metadata.Annotations, - StorageSize: item.Spec.Resources.Requests.Storage, - Phase: item.Status.Phase, + Name: item.Metadata.Name, + Annotations: item.Metadata.Annotations, + StorageSize: item.Spec.Resources.Requests.Storage, + Phase: item.Status.Phase, + IsTerminating: item.Metadata.DeletionTimestamp != nil, }) } } @@ -325,7 +328,7 @@ func testTieredStorageCachePvcResize() bool { // Two acceptable success paths — the resize can complete in seconds, often faster // than the polling interval, so insisting on the intermediate "pending-deletion" // snapshot makes this phase flake-prone: - // A) In-progress: cacheVolumeStates[mp]=="pending-deletion" AND ≥2 cache PVCs. + // A) In-progress: tieredCacheVolumes[mp]=="pending-deletion" AND ≥2 cache PVCs. // B) Already completed: state cleared AND exactly one cache PVC at the shrunk size. // Either proves the resize was staged correctly; Phase 2/3 cover the remaining // invariants regardless of which path we observed. @@ -346,7 +349,7 @@ func testTieredStorageCachePvcResize() bool { if state == "" && len(pvcs) == 1 && pvcs[0].StorageSize == tsResizeShrunkSize { return nil } - return fmt.Errorf("not at expected staging state: cacheVolumeStates[%s]=%q, %d cache PVC(s)", + return fmt.Errorf("not at expected staging state: tieredCacheVolumes[%s]=%q, %d cache PVC(s)", tsResizeCacheMountPath, state, len(pvcs)) }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) }) @@ -378,7 +381,7 @@ func testTieredStorageCachePvcResize() bool { activePvcs := make([]pvcItem, 0, len(pvcs)) for _, pvc := range pvcs { // Ignore PVCs that are being deleted (DeletionTimestamp set but not yet gone). - if pvc.Phase != "" { + if !pvc.IsTerminating { activePvcs = append(activePvcs, pvc) } } @@ -393,7 +396,7 @@ func testTieredStorageCachePvcResize() bool { return err } if state != "" { - return fmt.Errorf("expected cacheVolumeStates[%s] to be cleared, got %q", + return fmt.Errorf("expected tieredCacheVolumes[%s] to be cleared, got %q", tsResizeCacheMountPath, state) } return nil @@ -414,7 +417,7 @@ func testTieredStorageCachePvcResize() bool { state, err := getCacheResizeState(kubectlOptions, tsResizeClusterName, fmt.Sprintf("%d", tsResizeBrokerID), tsResizeCacheMountPath) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - gomega.Expect(state).To(gomega.BeEmpty(), "cacheVolumeStates entry should be cleared after resize") + gomega.Expect(state).To(gomega.BeEmpty(), "tieredCacheVolumes entry should be cleared after resize") }) ginkgo.It("Verifying the surviving cache PVC has the new size "+tsResizeShrunkSize, func() { From 76dbce1728d2845338d6034f6c81416e7abb4d7c Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Fri, 8 May 2026 17:22:11 +0300 Subject: [PATCH 17/21] agent review- e2e tests --- tests/e2e/test_tiered_storage_cache_resize.go | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/e2e/test_tiered_storage_cache_resize.go b/tests/e2e/test_tiered_storage_cache_resize.go index 273858d63..e826646cf 100644 --- a/tests/e2e/test_tiered_storage_cache_resize.go +++ b/tests/e2e/test_tiered_storage_cache_resize.go @@ -329,7 +329,9 @@ func testTieredStorageCachePvcResize() bool { // than the polling interval, so insisting on the intermediate "pending-deletion" // snapshot makes this phase flake-prone: // A) In-progress: tieredCacheVolumes[mp]=="pending-deletion" AND ≥2 cache PVCs. - // B) Already completed: state cleared AND exactly one cache PVC at the shrunk size. + // B) Already completed: state is "active" (no resize in flight) AND exactly one + // cache PVC at the shrunk size. "active" is the steady-state after a resize; + // "" (absent) would mean the mount path was removed entirely. // Either proves the resize was staged correctly; Phase 2/3 cover the remaining // invariants regardless of which path we observed. ginkgo.By("Waiting until the resize is observable as in-progress or completed") @@ -346,7 +348,7 @@ func testTieredStorageCachePvcResize() bool { if state == "pending-deletion" && len(pvcs) >= 2 { return nil } - if state == "" && len(pvcs) == 1 && pvcs[0].StorageSize == tsResizeShrunkSize { + if state == "active" && len(pvcs) == 1 && pvcs[0].StorageSize == tsResizeShrunkSize { return nil } return fmt.Errorf("not at expected staging state: tieredCacheVolumes[%s]=%q, %d cache PVC(s)", @@ -389,15 +391,15 @@ func testTieredStorageCachePvcResize() bool { return fmt.Errorf("expected 1 active cache PVC for broker %d after pod restart, got %d", tsResizeBrokerID, len(activePvcs)) } - // The resize state should be cleared once the old PVC is gone. + // After cleanup the resize state transitions back to "active" — no longer pending-deletion. state, err := getCacheResizeState(kubectlOptions, tsResizeClusterName, fmt.Sprintf("%d", tsResizeBrokerID), tsResizeCacheMountPath) if err != nil { return err } - if state != "" { - return fmt.Errorf("expected tieredCacheVolumes[%s] to be cleared, got %q", - tsResizeCacheMountPath, state) + if state == "pending-deletion" { + return fmt.Errorf("tieredCacheVolumes[%s] still pending-deletion after old PVC deleted", + tsResizeCacheMountPath) } return nil }, tsResizePhaseTimeout, tsResizePollingInterval).ShouldNot(gomega.HaveOccurred()) @@ -417,7 +419,7 @@ func testTieredStorageCachePvcResize() bool { state, err := getCacheResizeState(kubectlOptions, tsResizeClusterName, fmt.Sprintf("%d", tsResizeBrokerID), tsResizeCacheMountPath) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - gomega.Expect(state).To(gomega.BeEmpty(), "tieredCacheVolumes entry should be cleared after resize") + gomega.Expect(state).To(gomega.Equal("active"), "tieredCacheVolumes entry should be active after resize completes") }) ginkgo.It("Verifying the surviving cache PVC has the new size "+tsResizeShrunkSize, func() { From e828af36dc9729b5490bc1f43d28eccfa091e8d4 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Wed, 13 May 2026 16:00:45 +0300 Subject: [PATCH 18/21] agent review --- pkg/resources/kafka/kafka.go | 56 ++++++++++++++++++--- pkg/webhooks/kafkacluster_validator.go | 54 +++++++++++++++++--- pkg/webhooks/kafkacluster_validator_test.go | 49 ++++++++++++++++++ 3 files changed, 145 insertions(+), 14 deletions(-) diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index 19a1ebf18..f6cf30765 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -167,22 +167,64 @@ func getCreatedPvcForBroker( // 1. Always exclude terminating PVCs (DeletionTimestamp set) — they are released from pods // and should not be mounted by the new broker pod. // 2. For mount paths with an in-flight cache resize (pendingDeletionMountPaths), two PVCs - // temporarily coexist (old + replacement). Keep only one so the pod spec has a unique - // mount path (the reconciler will use the correctly-sized one during PVC reconciliation). - seenMountPaths := make(map[string]bool) + // temporarily coexist (old + replacement). Deterministically select the one whose storage + // request matches the desired size; fall back to the lexicographically smallest name so + // the pod-spec is stable across reconcile cycles regardless of API list order. + + // Build desired-size map for pending-deletion mount paths. + desiredSizeByMountPath := make(map[string]int64) + for i := range storageConfigs { + if storageConfigs[i].PvcSpec != nil { + if qty := storageConfigs[i].PvcSpec.Resources.Requests.Storage(); qty != nil { + desiredSizeByMountPath[storageConfigs[i].MountPath] = qty.Value() + } + } + } + + // First pass: for each pending-deletion mount path, pick the best PVC to keep. + // Preference order: (1) PVC whose size matches desired, (2) lexicographically smallest name. + bestIdx := make(map[string]int) // mountPath -> index of selected PVC + bestMatchesSize := make(map[string]bool) // mountPath -> whether selected PVC matches desired size + for i := range foundPvcList.Items { + pvc := &foundPvcList.Items[i] + if pvc.DeletionTimestamp != nil { + continue + } + mp := pvc.Annotations["mountPath"] + if !pendingDeletionMountPaths[mp] { + continue + } + qty := pvc.Spec.Resources.Requests.Storage() + matchesDesired := qty != nil && qty.Value() == desiredSizeByMountPath[mp] + if _, seen := bestIdx[mp]; !seen { + bestIdx[mp] = i + bestMatchesSize[mp] = matchesDesired + continue + } + if matchesDesired && !bestMatchesSize[mp] { + // This PVC matches desired size; the current best does not — prefer this one. + bestIdx[mp] = i + bestMatchesSize[mp] = true + } else if matchesDesired == bestMatchesSize[mp] && pvc.Name < foundPvcList.Items[bestIdx[mp]].Name { + // Same category; prefer lexicographically smaller name for stability. + bestIdx[mp] = i + } + } + + // Second pass: apply filters. n := 0 - for _, pvc := range foundPvcList.Items { + for i := range foundPvcList.Items { + pvc := &foundPvcList.Items[i] if pvc.DeletionTimestamp != nil { continue } mp := pvc.Annotations["mountPath"] if pendingDeletionMountPaths[mp] { - if seenMountPaths[mp] { + if idx, ok := bestIdx[mp]; !ok || idx != i { continue } - seenMountPaths[mp] = true } - foundPvcList.Items[n] = pvc + foundPvcList.Items[n] = *pvc n++ } foundPvcList.Items = foundPvcList.Items[:n] diff --git a/pkg/webhooks/kafkacluster_validator.go b/pkg/webhooks/kafkacluster_validator.go index 5c7b9fbde..735525ce2 100644 --- a/pkg/webhooks/kafkacluster_validator.go +++ b/pkg/webhooks/kafkacluster_validator.go @@ -85,22 +85,31 @@ func (s KafkaClusterValidator) ValidateDelete(_ context.Context, _ *banzaicloudv } // checkTieredStorageCacheImmutability rejects updates that change the tieredStorageCache -// classification of an existing PVC. It uses status.BrokersState[brokerID].TieredCacheVolumes -// as the authoritative source of truth — the reconciler writes this map when it creates each PVC. -// Comparing against committed status catches all bypass paths (in-place flip, group-switch, -// inline-shadow override) without requiring spec-level merging or effective-config resolution. -// Removing a mountPath from the spec is allowed (delete-and-re-add path). +// classification of an existing PVC. Two complementary checks are applied: +// +// 1. Status-based (true→false): uses status.BrokersState[brokerID].TieredCacheVolumes as the +// authoritative source of truth for cache volumes — the reconciler writes this map when it +// creates each PVC. Catches all bypass paths (in-place flip, group-switch, inline-shadow). +// +// 2. Spec-based (false→true): iterates the old spec to find non-cache mount paths and rejects +// any update that would flip tieredStorageCache from false to true on an already-provisioned +// log-dir volume. Non-cache PVCs have no status entry, so the status-based check alone +// would silently allow the flip — the broker would restart and drop the path from log.dirs, +// losing access to the existing Kafka data on that volume. +// +// Removing a mountPath from the spec is allowed in both cases (delete-and-re-add path). func checkTieredStorageCacheImmutability(oldCluster, newCluster *banzaicloudv1beta1.KafkaCluster) field.ErrorList { var allErrs field.ErrorList + // Check 1: status-based — existing cache volumes must not be reclassified as non-cache. for brokerIDStr, brokerState := range oldCluster.Status.BrokersState { for mountPath, state := range brokerState.TieredCacheVolumes { if state == "" { - continue // not a cache PVC + continue } newValue, fieldPath, found := findTieredStorageCacheInSpec(&newCluster.Spec, brokerIDStr, mountPath) if !found { - continue // mountPath removed from spec — remove-and-re-add is allowed + continue // mountPath removed from spec — allowed } if !newValue { allErrs = append(allErrs, field.Forbidden(fieldPath, immutableTieredStorageCacheErrMsg)) @@ -108,6 +117,37 @@ func checkTieredStorageCacheImmutability(oldCluster, newCluster *banzaicloudv1be } } + // Check 2: spec-based — existing non-cache volumes must not be reclassified as cache. + for _, broker := range oldCluster.Spec.Brokers { + brokerIDStr := strconv.Itoa(int(broker.Id)) + + mountPaths := make(map[string]struct{}) + if broker.BrokerConfig != nil { + for _, sc := range broker.BrokerConfig.StorageConfigs { + mountPaths[sc.MountPath] = struct{}{} + } + } + if broker.BrokerConfigGroup != "" { + if group, ok := oldCluster.Spec.BrokerConfigGroups[broker.BrokerConfigGroup]; ok { + for _, sc := range group.StorageConfigs { + mountPaths[sc.MountPath] = struct{}{} + } + } + } + + for mountPath := range mountPaths { + oldValue, _, oldFound := findTieredStorageCacheInSpec(&oldCluster.Spec, brokerIDStr, mountPath) + if !oldFound || oldValue { + continue // not present or already cache — handled by check 1 + } + newValue, fieldPath, newFound := findTieredStorageCacheInSpec(&newCluster.Spec, brokerIDStr, mountPath) + if !newFound || !newValue { + continue // mountPath removed or unchanged + } + allErrs = append(allErrs, field.Forbidden(fieldPath, immutableTieredStorageCacheErrMsg)) + } + } + return allErrs } diff --git a/pkg/webhooks/kafkacluster_validator_test.go b/pkg/webhooks/kafkacluster_validator_test.go index 7d81acf69..52b603be2 100644 --- a/pkg/webhooks/kafkacluster_validator_test.go +++ b/pkg/webhooks/kafkacluster_validator_test.go @@ -403,6 +403,55 @@ func TestCheckTieredStorageCacheImmutability(t *testing.T) { }}, expected: nil, }, + { + // Existing log-dir volume flipped to cache via inline config. + // The old spec has tieredStorageCache=false, the new spec flips it to true. + // The broker has no TieredCacheVolumes status entry (it is a log dir, not a cache), + // so only the spec-based check catches this. + testName: "false→true flip on existing log-dir volume rejected (inline)", + oldCluster: &v1beta1.KafkaCluster{ + Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ + StorageConfigs: []v1beta1.StorageConfig{sc("/data", false)}, + }}}, + }, + }, + newCluster: &v1beta1.KafkaCluster{Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfig: &v1beta1.BrokerConfig{ + StorageConfigs: []v1beta1.StorageConfig{sc("/data", true)}, + }}}, + }}, + expected: append(field.ErrorList{}, + field.Forbidden( + field.NewPath("spec").Child("brokers").Index(0).Child("brokerConfig").Child("storageConfigs").Index(0).Child("tieredStorageCache"), + immutableTieredStorageCacheErrMsg, + ), + ), + }, + { + // Existing log-dir volume flipped to cache via brokerConfigGroup. + testName: "false→true flip on existing log-dir volume rejected (group)", + oldCluster: &v1beta1.KafkaCluster{ + Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfigGroup: "default"}}, + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ + "default": {StorageConfigs: []v1beta1.StorageConfig{sc("/data", false)}}, + }, + }, + }, + newCluster: &v1beta1.KafkaCluster{Spec: v1beta1.KafkaClusterSpec{ + Brokers: []v1beta1.Broker{{Id: 0, BrokerConfigGroup: "default"}}, + BrokerConfigGroups: map[string]v1beta1.BrokerConfig{ + "default": {StorageConfigs: []v1beta1.StorageConfig{sc("/data", true)}}, + }, + }}, + expected: append(field.ErrorList{}, + field.Forbidden( + field.NewPath("spec").Child("brokerConfigGroups").Key("default").Child("storageConfigs").Index(0).Child("tieredStorageCache"), + immutableTieredStorageCacheErrMsg, + ), + ), + }, } for _, testCase := range testCases { From 2f747f108051f40af076d589fa35ff39ff6f0950 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Wed, 17 Jun 2026 15:26:09 +0300 Subject: [PATCH 19/21] agent review - fix nil panics, constants and observability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix nil-dereference in isDesiredStorageValueInvalid: guard both Storage() calls before calling .Value(), consistent with all other new call sites in this branch - Add TieredCacheVolumeRemoved sentinel constant to replace bare "" assignments in UpdateBrokerStatus calls; update all consumers in status.go and kafkacluster_validator.go - Replace ~11 bare "mountPath" annotation key literals with the mountPathAnnotationKey constant across all new functions - Add log.Info on pod-down pending-deletion→active state transition (the normal resize completion path was previously silent) - Fix ValidateUpdate webhook signature to use concrete *KafkaCluster types instead of runtime.Object; remove the now-unused runtime import Co-Authored-By: Claude Sonnet 4.6 --- api/v1beta1/common_types.go | 4 ++ pkg/k8sutil/status.go | 2 +- pkg/resources/kafka/kafka.go | 69 +++++++++++++++----------- pkg/resources/kafka/pvc.go | 2 +- pkg/webhooks/kafkacluster_validator.go | 7 +-- 5 files changed, 49 insertions(+), 35 deletions(-) diff --git a/api/v1beta1/common_types.go b/api/v1beta1/common_types.go index 57edfaf52..0f3234b37 100644 --- a/api/v1beta1/common_types.go +++ b/api/v1beta1/common_types.go @@ -227,6 +227,10 @@ const ( // to be deleted once the broker pod stops. A replacement PVC with the new desired size has // already been created at the same mount path. TieredCacheVolumePendingDeletion TieredCacheVolumeState = "pending-deletion" + // TieredCacheVolumeRemoved is the sentinel value passed to UpdateBrokerStatus to delete a + // TieredCacheVolumes map entry. The zero value of the type is intentionally used so that + // callers are explicit rather than relying on an untyped empty string. + TieredCacheVolumeRemoved TieredCacheVolumeState = "" ) // BrokerState holds information about broker state diff --git a/pkg/k8sutil/status.go b/pkg/k8sutil/status.go index f287727ed..b98d14844 100644 --- a/pkg/k8sutil/status.go +++ b/pkg/k8sutil/status.go @@ -179,7 +179,7 @@ func generateBrokerState(brokerIDs []string, cluster *banzaicloudv1beta1.KafkaCl brokerState.TieredCacheVolumes = make(map[string]banzaicloudv1beta1.TieredCacheVolumeState) } for mountPath, state := range s { - if state == "" { + if state == banzaicloudv1beta1.TieredCacheVolumeRemoved { delete(brokerState.TieredCacheVolumes, mountPath) } else { brokerState.TieredCacheVolumes[mountPath] = state diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index f6cf30765..a75a8ac8c 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -67,9 +67,6 @@ const ( brokerConfigTemplate = "%s-config" brokerStorageTemplate = "%s-%d-storage-%d-" - // annotationTrue is the string value used for boolean-true annotations and config comparisons. - annotationTrue = "true" - brokerConfigMapVolumeMount = "broker-config" kafkaDataVolumeMount = "kafka-data" @@ -190,7 +187,7 @@ func getCreatedPvcForBroker( if pvc.DeletionTimestamp != nil { continue } - mp := pvc.Annotations["mountPath"] + mp := pvc.Annotations[mountPathAnnotationKey] if !pendingDeletionMountPaths[mp] { continue } @@ -218,7 +215,7 @@ func getCreatedPvcForBroker( if pvc.DeletionTimestamp != nil { continue } - mp := pvc.Annotations["mountPath"] + mp := pvc.Annotations[mountPathAnnotationKey] if pendingDeletionMountPaths[mp] { if idx, ok := bestIdx[mp]; !ok || idx != i { continue @@ -1275,7 +1272,7 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro desiredMountPaths := make(map[string]bool, len(desiredPvcs)) for _, dp := range desiredPvcs { - desiredMountPaths[dp.Annotations["mountPath"]] = true + desiredMountPaths[dp.Annotations[mountPathAnnotationKey]] = true } skipBroker, err := r.handleBrokerCacheResizeCleanup(ctx, log, brokerId, pvcList, desiredPvcs, desiredMountPaths, brokerPodExists, matchingLabels) @@ -1334,11 +1331,11 @@ func (r *Reconciler) reconcileKafkaPvc(ctx context.Context, log logr.Logger, bro // using status.BrokersState as the authoritative source of truth (written at PVC creation). // Falls back to the PVC annotation for PVCs created before TieredCacheVolumes was introduced. func (r *Reconciler) isBrokerPVCTieredCache(pvc *corev1.PersistentVolumeClaim, brokerId string) bool { - mountPath := pvc.Annotations["mountPath"] + mountPath := pvc.Annotations[mountPathAnnotationKey] if state := r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes[mountPath]; state != "" { return true } - return pvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == annotationTrue + return pvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == configValueTrue } // deleteRemovedCachePVCs deletes tiered storage cache PVCs whose mount path is no longer desired @@ -1360,7 +1357,7 @@ func (r *Reconciler) deleteRemovedCachePVCs( if !r.isBrokerPVCTieredCache(pvc, brokerId) { continue } - mountPath := pvc.Annotations["mountPath"] + mountPath := pvc.Annotations[mountPathAnnotationKey] if desiredMountPaths[mountPath] { continue } @@ -1375,7 +1372,7 @@ func (r *Reconciler) deleteRemovedCachePVCs( } // Clear status regardless of PVC termination state: once deletion has been // requested (DeletionTimestamp set), the status entry is no longer needed. - pathsToUntrack[mountPath] = "" + pathsToUntrack[mountPath] = banzaiv1beta1.TieredCacheVolumeRemoved } // Second pass: clear status entries for mount paths that are no longer desired @@ -1388,7 +1385,7 @@ func (r *Reconciler) deleteRemovedCachePVCs( if _, alreadyHandled := pathsToUntrack[mountPath]; !alreadyHandled { log.Info("Clearing stale tiered cache volume status for already-deleted mount path", "brokerId", brokerId, "mountPath", mountPath) - pathsToUntrack[mountPath] = "" + pathsToUntrack[mountPath] = banzaiv1beta1.TieredCacheVolumeRemoved } } @@ -1410,7 +1407,7 @@ func (r *Reconciler) effectivePvcCount(brokerId string, pvcList *corev1.Persiste counted := make(map[string]bool) total := 0 for _, pvc := range pvcList.Items { - mp := pvc.Annotations["mountPath"] + mp := pvc.Annotations[mountPathAnnotationKey] if tieredCacheVolumes[mp] == banzaiv1beta1.TieredCacheVolumePendingDeletion { if !counted[mp] { counted[mp] = true @@ -1476,7 +1473,7 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( desiredSizeByMountPath := make(map[string]int64, len(desiredPvcs)) for _, dp := range desiredPvcs { if s := dp.Spec.Resources.Requests.Storage(); s != nil { - desiredSizeByMountPath[dp.Annotations["mountPath"]] = s.Value() + desiredSizeByMountPath[dp.Annotations[mountPathAnnotationKey]] = s.Value() } } @@ -1499,7 +1496,7 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( replacementExists := false oldPvcExists := false for _, pvc := range pvcList.Items { - if pvc.Annotations["mountPath"] != mp { + if pvc.Annotations[mountPathAnnotationKey] != mp { continue } if pvcSize := pvc.Spec.Resources.Requests.Storage(); pvcSize != nil { @@ -1526,7 +1523,7 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( // Mount path is no longer desired: delete all PVCs at this path (old + replacement). for i := range pvcList.Items { pvc := &pvcList.Items[i] - if pvc.Annotations["mountPath"] != mp { + if pvc.Annotations[mountPathAnnotationKey] != mp { continue } log.Info("Broker pod is down — deleting orphaned cache resize PVC", @@ -1536,16 +1533,16 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( "deleting orphaned cache resize PVC failed", "pvc", pvc.Name) } } - stateUpdates[mp] = "" // entry removed: volume gone, no longer a cache PVC + stateUpdates[mp] = banzaiv1beta1.TieredCacheVolumeRemoved // entry removed: volume gone, no longer a cache PVC } else { // Mount path still desired: delete the old PVC (the one whose size differs from desired). desiredSize := desiredSizeByMountPath[mp] for i := range pvcList.Items { pvc := &pvcList.Items[i] - if pvc.Annotations["mountPath"] != mp { + if pvc.Annotations[mountPathAnnotationKey] != mp { continue } - if pvc.Spec.Resources.Requests.Storage().Value() != desiredSize { + if pvcStorage := pvc.Spec.Resources.Requests.Storage(); pvcStorage == nil || pvcStorage.Value() != desiredSize { log.Info("Broker pod is down — deleting pending-deletion tiered storage cache PVC", "brokerId", brokerId, "mountPath", mp, "pvc", pvc.Name) if err := r.Delete(ctx, pvc); err != nil && !apierrors.IsNotFound(err) { @@ -1555,6 +1552,8 @@ func (r *Reconciler) handleBrokerCacheResizeCleanup( break } } + log.Info("Tiered storage cache PVC resize complete — clearing resize state after pod-down cleanup", + "brokerId", brokerId, "mountPath", mp) stateUpdates[mp] = banzaiv1beta1.TieredCacheVolumeActive // resize done, still a cache PVC } } @@ -1637,7 +1636,7 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( return errorfactory.New(errorfactory.APIFailure{}, err, "creating resource failed", "kind", desiredType) } log.Info("resource created") - if desiredPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == annotationTrue { + if desiredPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == configValueTrue { if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, map[string]banzaiv1beta1.TieredCacheVolumeState{mountPath: banzaiv1beta1.TieredCacheVolumeActive}, log); err != nil { return errorfactory.New(errorfactory.StatusUpdateError{}, err, @@ -1662,7 +1661,9 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( // We identify it as the PVC whose size differs from the desired size while a // pending-deletion resize is in flight for this mount path. if r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes[mountPath] == banzaiv1beta1.TieredCacheVolumePendingDeletion { - if desiredPvc.Spec.Resources.Requests.Storage().Value() != pvc.Spec.Resources.Requests.Storage().Value() { + desiredQty := desiredPvc.Spec.Resources.Requests.Storage() + pvcQty := pvc.Spec.Resources.Requests.Storage() + if desiredQty == nil || pvcQty == nil || desiredQty.Value() != pvcQty.Value() { continue } } @@ -1696,13 +1697,18 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( if err := r.DirectClient.List(ctx, liveList, client.InNamespace(r.KafkaCluster.GetNamespace()), matchingLabels); err != nil { return errorfactory.New(errorfactory.APIFailure{}, err, "uncached list of PVCs failed", "kind", desiredType) } - desiredSize := desiredPvc.Spec.Resources.Requests.Storage().Value() + desiredQty := desiredPvc.Spec.Resources.Requests.Storage() + if desiredQty == nil { + continue + } + desiredSize := desiredQty.Value() for i := range liveList.Items { p := &liveList.Items[i] - if p.DeletionTimestamp != nil || p.Annotations["mountPath"] != mountPath { + if p.DeletionTimestamp != nil || p.Annotations[mountPathAnnotationKey] != mountPath { continue } - if p.Spec.Resources.Requests.Storage().Value() == desiredSize { + pQty := p.Spec.Resources.Requests.Storage() + if pQty != nil && pQty.Value() == desiredSize { log.Info("Replacement cache PVC already exists from a prior partial attempt; skipping Create", "brokerId", brokerId, "mountPath", mountPath, "pvc", p.Name) alreadyCreated = true @@ -1723,7 +1729,7 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( // Only write Active when this is a genuinely new PVC. If pending-deletion is already // set (crash-recovery: prior Create timed out), preserve the in-flight state so // handleBrokerCacheResizeCleanup can still delete the old PVC on the next cycle. - if desiredPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == annotationTrue && + if desiredPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == configValueTrue && r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes[mountPath] != banzaiv1beta1.TieredCacheVolumePendingDeletion { if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, map[string]banzaiv1beta1.TieredCacheVolumeState{mountPath: banzaiv1beta1.TieredCacheVolumeActive}, log); err != nil { @@ -1753,13 +1759,15 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( // (flipping tieredStorageCache: false → true on an existing data volume) route a real // log-dir PVC through the cache-shrink delete-and-recreate path, bypassing graceful drain. isTieredCache := r.isBrokerPVCTieredCache(currentPvc, brokerId) - desiredSize := desiredPvc.Spec.Resources.Requests.Storage().Value() - currentSize := currentPvc.Spec.Resources.Requests.Storage().Value() + desiredQty := desiredPvc.Spec.Resources.Requests.Storage() + currentQty := currentPvc.Spec.Resources.Requests.Storage() // Tiered storage cache PVC shrink: stage the replacement PVC immediately so // provisioning runs in parallel with rolling-upgrade gate evaluation. // The old PVC is annotated pending-deletion and removed once the broker pod stops. - if isTieredCache && desiredSize < currentSize { + if isTieredCache && desiredQty != nil && currentQty != nil && desiredQty.Value() < currentQty.Value() { + desiredSize := desiredQty.Value() + currentSize := currentQty.Value() if err := r.stageTieredCachePVCShrink(ctx, log, brokerId, mountPath, currentPvc, desiredPvc, currentSize, desiredSize); err != nil { return err } @@ -1931,7 +1939,12 @@ func GetBrokersWithPendingOrRunningCCTask(kafkaCluster *banzaiv1beta1.KafkaClust } func isDesiredStorageValueInvalid(desired, current *corev1.PersistentVolumeClaim) bool { - return desired.Spec.Resources.Requests.Storage().Value() < current.Spec.Resources.Requests.Storage().Value() + desiredQty := desired.Spec.Resources.Requests.Storage() + currentQty := current.Spec.Resources.Requests.Storage() + if desiredQty == nil || currentQty == nil { + return false + } + return desiredQty.Value() < currentQty.Value() } func (r *Reconciler) getBrokerHost(log logr.Logger, defaultHost string, broker banzaiv1beta1.Broker, eListener banzaiv1beta1.ExternalListenerConfig, iConfig banzaiv1beta1.IngressConfig) (string, error) { diff --git a/pkg/resources/kafka/pvc.go b/pkg/resources/kafka/pvc.go index 0451bf24a..ee16808ea 100644 --- a/pkg/resources/kafka/pvc.go +++ b/pkg/resources/kafka/pvc.go @@ -66,7 +66,7 @@ func (r *Reconciler) pvc(brokerId int32, storageIndex int, storage v1beta1.Stora // Mark tiered storage cache PVCs with annotation for special handling if storage.TieredStorageCache { - annotations[v1beta1.TieredStorageCacheAnnotationKey] = annotationTrue + annotations[v1beta1.TieredStorageCacheAnnotationKey] = configValueTrue } return &corev1.PersistentVolumeClaim{ diff --git a/pkg/webhooks/kafkacluster_validator.go b/pkg/webhooks/kafkacluster_validator.go index 735525ce2..5e6ef71bd 100644 --- a/pkg/webhooks/kafkacluster_validator.go +++ b/pkg/webhooks/kafkacluster_validator.go @@ -25,7 +25,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/webhook/admission" apierrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/validation/field" "github.com/go-logr/logr" @@ -38,10 +37,8 @@ type KafkaClusterValidator struct { Log logr.Logger } -func (s KafkaClusterValidator) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (warnings admission.Warnings, err error) { +func (s KafkaClusterValidator) ValidateUpdate(ctx context.Context, kafkaClusterOld, kafkaClusterNew *banzaicloudv1beta1.KafkaCluster) (warnings admission.Warnings, err error) { var allErrs field.ErrorList - kafkaClusterOld := oldObj.(*banzaicloudv1beta1.KafkaCluster) - kafkaClusterNew := newObj.(*banzaicloudv1beta1.KafkaCluster) log := s.Log.WithValues("name", kafkaClusterNew.GetName(), "namespace", kafkaClusterNew.GetNamespace()) listenerErrs := checkInternalAndExternalListeners(&kafkaClusterNew.Spec) @@ -104,7 +101,7 @@ func checkTieredStorageCacheImmutability(oldCluster, newCluster *banzaicloudv1be // Check 1: status-based — existing cache volumes must not be reclassified as non-cache. for brokerIDStr, brokerState := range oldCluster.Status.BrokersState { for mountPath, state := range brokerState.TieredCacheVolumes { - if state == "" { + if state == banzaicloudv1beta1.TieredCacheVolumeRemoved { continue } newValue, fieldPath, found := findTieredStorageCacheInSpec(&newCluster.Spec, brokerIDStr, mountPath) From 9d437d715e21699b67add7d8bd951f05a28af678 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Wed, 17 Jun 2026 16:53:55 +0300 Subject: [PATCH 20/21] lint issues fix --- pkg/webhooks/kafkacluster_validator.go | 16 ++++++------ tests/e2e/test_multidisk_removal.go | 2 +- tests/e2e/test_tiered_storage_cache_resize.go | 26 +++++++++---------- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/pkg/webhooks/kafkacluster_validator.go b/pkg/webhooks/kafkacluster_validator.go index 5e6ef71bd..fef7ec9cf 100644 --- a/pkg/webhooks/kafkacluster_validator.go +++ b/pkg/webhooks/kafkacluster_validator.go @@ -84,15 +84,15 @@ func (s KafkaClusterValidator) ValidateDelete(_ context.Context, _ *banzaicloudv // checkTieredStorageCacheImmutability rejects updates that change the tieredStorageCache // classification of an existing PVC. Two complementary checks are applied: // -// 1. Status-based (true→false): uses status.BrokersState[brokerID].TieredCacheVolumes as the -// authoritative source of truth for cache volumes — the reconciler writes this map when it -// creates each PVC. Catches all bypass paths (in-place flip, group-switch, inline-shadow). +// 1. Status-based (true→false): uses status.BrokersState[brokerID].TieredCacheVolumes as the +// authoritative source of truth for cache volumes — the reconciler writes this map when it +// creates each PVC. Catches all bypass paths (in-place flip, group-switch, inline-shadow). // -// 2. Spec-based (false→true): iterates the old spec to find non-cache mount paths and rejects -// any update that would flip tieredStorageCache from false to true on an already-provisioned -// log-dir volume. Non-cache PVCs have no status entry, so the status-based check alone -// would silently allow the flip — the broker would restart and drop the path from log.dirs, -// losing access to the existing Kafka data on that volume. +// 2. Spec-based (false→true): iterates the old spec to find non-cache mount paths and rejects +// any update that would flip tieredStorageCache from false to true on an already-provisioned +// log-dir volume. Non-cache PVCs have no status entry, so the status-based check alone +// would silently allow the flip — the broker would restart and drop the path from log.dirs, +// losing access to the existing Kafka data on that volume. // // Removing a mountPath from the spec is allowed in both cases (delete-and-re-add path). func checkTieredStorageCacheImmutability(oldCluster, newCluster *banzaicloudv1beta1.KafkaCluster) field.ErrorList { diff --git a/tests/e2e/test_multidisk_removal.go b/tests/e2e/test_multidisk_removal.go index a02db08e7..ec8b1895c 100644 --- a/tests/e2e/test_multidisk_removal.go +++ b/tests/e2e/test_multidisk_removal.go @@ -135,7 +135,7 @@ func getBrokerConfigMapLogDirs(kubectlOptions k8s.KubectlOptions, configMapName "-n", namespace, "-o", fmt.Sprintf("jsonpath={.data.%s}", kafkautils.ConfigPropertyName), } - output, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, args...) + output, err := k8s.RunKubectlAndGetOutputContextE(ginkgo.GinkgoT(), context.Background(), &kubectlOptions, args...) if err != nil { return nil, fmt.Errorf("getting configmap %s: %w", configMapName, err) } diff --git a/tests/e2e/test_tiered_storage_cache_resize.go b/tests/e2e/test_tiered_storage_cache_resize.go index e826646cf..7adf8e645 100644 --- a/tests/e2e/test_tiered_storage_cache_resize.go +++ b/tests/e2e/test_tiered_storage_cache_resize.go @@ -55,7 +55,7 @@ type pvcItem struct { // getCacheResizeState returns the tieredCacheVolumes entry for the given broker and mount path // from the KafkaCluster CR status, or an empty string if not set. func getCacheResizeState(kubectlOptions k8s.KubectlOptions, clusterName, brokerID, mountPath string) (string, error) { - rawOutput, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + rawOutput, err := k8s.RunKubectlAndGetOutputContextE(ginkgo.GinkgoT(), context.Background(), &kubectlOptions, "get", kafkaKind, clusterName, "--output", "json", ) @@ -86,7 +86,7 @@ func getCacheResizeState(kubectlOptions k8s.KubectlOptions, clusterName, brokerI func listBrokerCachePVCs(kubectlOptions k8s.KubectlOptions) ([]pvcItem, error) { selector := fmt.Sprintf("%s=%s,brokerId=%d", kafkaCRLabelKey, tsResizeClusterName, tsResizeBrokerID) - rawOutput, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + rawOutput, err := k8s.RunKubectlAndGetOutputContextE(ginkgo.GinkgoT(), context.Background(), &kubectlOptions, "get", "persistentvolumeclaims", "-l", selector, "--output", "json", @@ -136,7 +136,7 @@ func listBrokerCachePVCs(kubectlOptions k8s.KubectlOptions) ([]pvcItem, error) { // getBrokerPodUID returns the UID of the running (non-terminating) broker pod for the // given broker ID, or an error if no such pod is found. func getBrokerPodUID(kubectlOptions k8s.KubectlOptions, clusterName string, brokerID int) (string, error) { - rawOutput, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + rawOutput, err := k8s.RunKubectlAndGetOutputContextE(ginkgo.GinkgoT(), context.Background(), &kubectlOptions, "get", "pod", "-l", fmt.Sprintf("%s=%s,brokerId=%d,app=kafka", kafkaCRLabelKey, clusterName, brokerID), "--output", "json", @@ -192,19 +192,19 @@ func testTieredStorageCachePvcResize() bool { return } opts.Namespace = koperatorLocalHelmDescriptor.Namespace - _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &opts, + _, _ = k8s.RunKubectlAndGetOutputContextE(ginkgo.GinkgoT(), context.Background(), &opts, "patch", kafkaKind, tsResizeClusterName, "--type=merge", `--patch={"metadata":{"finalizers":[]}}`, ) - _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &opts, + _, _ = k8s.RunKubectlAndGetOutputContextE(ginkgo.GinkgoT(), context.Background(), &opts, "delete", kafkaKind, tsResizeClusterName, "--ignore-not-found", ) - _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &opts, + _, _ = k8s.RunKubectlAndGetOutputContextE(ginkgo.GinkgoT(), context.Background(), &opts, "delete", "persistentvolumeclaims", "-l", fmt.Sprintf("%s=%s", kafkaCRLabelKey, tsResizeClusterName), "--ignore-not-found", ) - _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &opts, + _, _ = k8s.RunKubectlAndGetOutputContextE(ginkgo.GinkgoT(), context.Background(), &opts, "delete", "pods", "-l", fmt.Sprintf("%s=%s", kafkaCRLabelKey, tsResizeClusterName), "--ignore-not-found", "--grace-period=0", @@ -225,30 +225,30 @@ func testTieredStorageCachePvcResize() bool { // owned resource so nothing blocks the subsequent fresh install. ginkgo.It("Pre-cleanup: removing any leftover kafka-ts-resize cluster", func() { // 1. Remove finalizers so the CR can be deleted regardless of operator state. - _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + _, _ = k8s.RunKubectlAndGetOutputContextE(ginkgo.GinkgoT(), context.Background(), &kubectlOptions, "patch", kafkaKind, tsResizeClusterName, "--type=merge", `--patch={"metadata":{"finalizers":[]}}`, ) // 2. Delete the CR itself (ignore not-found). - _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + _, _ = k8s.RunKubectlAndGetOutputContextE(ginkgo.GinkgoT(), context.Background(), &kubectlOptions, "delete", kafkaKind, tsResizeClusterName, "--ignore-not-found", ) // 3. Explicitly delete PVCs — they carry a pvc-protection finalizer that // blocks cascade GC while pods are still bound. - _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + _, _ = k8s.RunKubectlAndGetOutputContextE(ginkgo.GinkgoT(), context.Background(), &kubectlOptions, "delete", "persistentvolumeclaims", "-l", fmt.Sprintf("%s=%s", kafkaCRLabelKey, tsResizeClusterName), "--ignore-not-found", ) // 4. Delete pods so they release PVC mounts promptly. - _, _ = k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + _, _ = k8s.RunKubectlAndGetOutputContextE(ginkgo.GinkgoT(), context.Background(), &kubectlOptions, "delete", "pods", "-l", fmt.Sprintf("%s=%s", kafkaCRLabelKey, tsResizeClusterName), "--ignore-not-found", "--grace-period=0", ) // 5. Wait for the CR itself to be gone (GC will handle the rest). gomega.Eventually(context.Background(), func() error { - out, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + out, err := k8s.RunKubectlAndGetOutputContextE(ginkgo.GinkgoT(), context.Background(), &kubectlOptions, "get", kafkaKind, tsResizeClusterName, ) if err != nil || strings.Contains(out, "NotFound") || strings.Contains(out, "not found") { @@ -268,7 +268,7 @@ func testTieredStorageCachePvcResize() bool { // resize — we don't gate on ClusterRunning because Cruise Control may still be // completing the post-startup rebalance when the resize is initiated. gomega.Eventually(context.Background(), func() error { - output, err := k8s.RunKubectlAndGetOutputE(ginkgo.GinkgoT(), &kubectlOptions, + output, err := k8s.RunKubectlAndGetOutputContextE(ginkgo.GinkgoT(), context.Background(), &kubectlOptions, "get", "pod", "-l", fmt.Sprintf("%s=%s,app=kafka,isBrokerNode=true", kafkaCRLabelKey, tsResizeClusterName), "--output", "json", From 7f6d0c042d185ffa8918061ccda3b71d0d881295 Mon Sep 17 00:00:00 2001 From: Eduard Agarici Date: Wed, 17 Jun 2026 17:53:31 +0300 Subject: [PATCH 21/21] switch tieredStorageCache from annotation to label --- api/v1beta1/kafkacluster_types.go | 10 ++++++---- pkg/resources/kafka/kafka.go | 6 +++--- pkg/resources/kafka/kafka_test.go | 10 +++++----- pkg/resources/kafka/pvc.go | 6 +++--- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/api/v1beta1/kafkacluster_types.go b/api/v1beta1/kafkacluster_types.go index 89ada878a..bc379d421 100644 --- a/api/v1beta1/kafkacluster_types.go +++ b/api/v1beta1/kafkacluster_types.go @@ -50,11 +50,13 @@ const ( // IsControllerNodeKey is used to identify if the kafka pod is a controller or broker_controller IsControllerNodeKey = "isControllerNode" - // TieredStorageCacheAnnotationKey marks a PVC as backing a Kafka tiered storage cache. + // TieredStorageCacheLabelKey marks a PVC as backing a Kafka tiered storage cache. // Cache PVCs are excluded from log.dirs and Cruise Control capacity, and follow a - // delete-and-recreate path on shrink. Read sites must use this constant rather than the - // literal so a typo cannot silently misclassify a cache PVC as a regular log-dir volume. - TieredStorageCacheAnnotationKey = "tieredStorageCache" + // delete-and-recreate path on shrink. Using a label (rather than an annotation) allows + // direct kubectl selection: kubectl get pvc -l tieredStorageCache=true + // Read sites must use this constant rather than the literal so a typo cannot silently + // misclassify a cache PVC as a regular log-dir volume. + TieredStorageCacheLabelKey = "tieredStorageCache" // DefaultCruiseControlImage is the default CC image used when users don't specify it in CruiseControlConfig.Image DefaultCruiseControlImage = "adobe/cruise-control:3.0.3-adbe-20250804" // renovate: datasource=docker depName=adobe/cruise-control diff --git a/pkg/resources/kafka/kafka.go b/pkg/resources/kafka/kafka.go index a75a8ac8c..ba77a9b15 100644 --- a/pkg/resources/kafka/kafka.go +++ b/pkg/resources/kafka/kafka.go @@ -1335,7 +1335,7 @@ func (r *Reconciler) isBrokerPVCTieredCache(pvc *corev1.PersistentVolumeClaim, b if state := r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes[mountPath]; state != "" { return true } - return pvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == configValueTrue + return pvc.Labels[banzaiv1beta1.TieredStorageCacheLabelKey] == configValueTrue } // deleteRemovedCachePVCs deletes tiered storage cache PVCs whose mount path is no longer desired @@ -1636,7 +1636,7 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( return errorfactory.New(errorfactory.APIFailure{}, err, "creating resource failed", "kind", desiredType) } log.Info("resource created") - if desiredPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == configValueTrue { + if desiredPvc.Labels[banzaiv1beta1.TieredStorageCacheLabelKey] == configValueTrue { if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, map[string]banzaiv1beta1.TieredCacheVolumeState{mountPath: banzaiv1beta1.TieredCacheVolumeActive}, log); err != nil { return errorfactory.New(errorfactory.StatusUpdateError{}, err, @@ -1729,7 +1729,7 @@ func (r *Reconciler) reconcileDesiredPvcsForBroker( // Only write Active when this is a genuinely new PVC. If pending-deletion is already // set (crash-recovery: prior Create timed out), preserve the in-flight state so // handleBrokerCacheResizeCleanup can still delete the old PVC on the next cycle. - if desiredPvc.Annotations[banzaiv1beta1.TieredStorageCacheAnnotationKey] == configValueTrue && + if desiredPvc.Labels[banzaiv1beta1.TieredStorageCacheLabelKey] == configValueTrue && r.KafkaCluster.Status.BrokersState[brokerId].TieredCacheVolumes[mountPath] != banzaiv1beta1.TieredCacheVolumePendingDeletion { if err := k8sutil.UpdateBrokerStatus(r.Client, []string{brokerId}, r.KafkaCluster, map[string]banzaiv1beta1.TieredCacheVolumeState{mountPath: banzaiv1beta1.TieredCacheVolumeActive}, log); err != nil { diff --git a/pkg/resources/kafka/kafka_test.go b/pkg/resources/kafka/kafka_test.go index 168e43884..fb13a97d2 100644 --- a/pkg/resources/kafka/kafka_test.go +++ b/pkg/resources/kafka/kafka_test.go @@ -1365,13 +1365,13 @@ func TestReconcileKafkaPvcTieredCacheResize(t *testing.T) { Name: name, Namespace: namespace, Labels: map[string]string{ - v1beta1.BrokerIdLabelKey: brokerId, - v1beta1.AppLabelKey: "kafka", - v1beta1.KafkaCRLabelKey: clusterName, + v1beta1.BrokerIdLabelKey: brokerId, + v1beta1.AppLabelKey: "kafka", + v1beta1.KafkaCRLabelKey: clusterName, + v1beta1.TieredStorageCacheLabelKey: "true", }, Annotations: map[string]string{ - "mountPath": mountPath, - v1beta1.TieredStorageCacheAnnotationKey: "true", + "mountPath": mountPath, }, }, Spec: corev1.PersistentVolumeClaimSpec{ diff --git a/pkg/resources/kafka/pvc.go b/pkg/resources/kafka/pvc.go index ee16808ea..53df077a7 100644 --- a/pkg/resources/kafka/pvc.go +++ b/pkg/resources/kafka/pvc.go @@ -64,9 +64,9 @@ func (r *Reconciler) pvc(brokerId int32, storageIndex int, storage v1beta1.Stora annotations := map[string]string{mountPathAnnotationKey: storage.MountPath} - // Mark tiered storage cache PVCs with annotation for special handling + extraLabels := map[string]string{v1beta1.BrokerIdLabelKey: fmt.Sprintf("%d", brokerId)} if storage.TieredStorageCache { - annotations[v1beta1.TieredStorageCacheAnnotationKey] = configValueTrue + extraLabels[v1beta1.TieredStorageCacheLabelKey] = configValueTrue } return &corev1.PersistentVolumeClaim{ @@ -74,7 +74,7 @@ func (r *Reconciler) pvc(brokerId int32, storageIndex int, storage v1beta1.Stora fmt.Sprintf(brokerStorageTemplate, r.KafkaCluster.Name, brokerId, storageIndex), apiutil.MergeLabels( apiutil.LabelsForKafka(r.KafkaCluster.Name), - map[string]string{v1beta1.BrokerIdLabelKey: fmt.Sprintf("%d", brokerId)}, + extraLabels, ), annotations, r.KafkaCluster), Spec: pvcSpec,