Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion observer-operator/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ docker-buildx: test ## Build and push docker image for the manager for cross-pla
sed -e '1 s/\(^FROM\)/FROM --platform=\$$\{BUILDPLATFORM\}/; t' -e ' 1,// s//FROM --platform=\$$\{BUILDPLATFORM\}/' Dockerfile > Dockerfile.cross
- docker buildx create --name project-v3-builder
docker buildx use project-v3-builder
- docker buildx build --push --platform=$(PLATFORMS) --tag ${IMG} -f Dockerfile.cross
- docker buildx build --push --platform=$(PLATFORMS) --tag ${IMG} -f Dockerfile.cross .
- docker buildx rm project-v3-builder
rm Dockerfile.cross

Expand Down
22 changes: 22 additions & 0 deletions observer-operator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,28 @@ The commands for creating the domain and API:

operator-sdk create api --group sdi --version v1alpha1 --kind SDIObserver --resource --controller

## Troubleshooting

### TLS Handshake Errors After vsystem Pod Replacement

After the vsystem pod is replaced (due to an upgrade, restart, or redeployment), you may see TLS handshake errors in the vsystem logs:

```
vSystem main server http: TLS handshake error from 10.x.x.x: local error: tls: bad record MAC
```

**Cause**: The OpenShift router caches TLS session data. When the vsystem pod is replaced with new certificates, the router's cached sessions become invalid.

**Resolution**: Delete the vsystem route and let the operator recreate it:

```sh
oc delete route vsystem -n <sdi-namespace>
```

If `sdiVSystemRoute.managementState` is set to `Managed`, the operator will automatically recreate the route, and the TLS errors will stop.

**Note**: This issue is transient and will also resolve itself after a few minutes as the router's TLS session cache expires.

## License

Copyright 2023.
Expand Down
22 changes: 13 additions & 9 deletions observer-operator/controllers/sdiobserver_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,8 @@ type SDIObserverReconciler struct {
// For more details, check Reconcile and its Result here:
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.13.0/pkg/reconcile
func (r *SDIObserverReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
logger := log.FromContext(ctx).WithValues(
"sdiobserver", req.NamespacedName,
"namespace", req.Namespace,
"name", req.Name,
)
// controller-runtime already attaches controller, namespace, name, and reconcileID to the logger carried in ctx
logger := log.FromContext(ctx)

operatorCR := &sdiv1alpha1.SDIObserver{}
err := r.Get(ctx, req.NamespacedName, operatorCR)
Expand Down Expand Up @@ -178,6 +175,8 @@ func (r *SDIObserverReconciler) ensureStatusConditions(cr *sdiv1alpha1.SDIObserv
}

func (r *SDIObserverReconciler) handleError(ctx context.Context, cr *sdiv1alpha1.SDIObserver, err error, msg string) (ctrl.Result, error) {
logger := log.FromContext(ctx)

meta.SetStatusCondition(&cr.Status.Conditions, metav1.Condition{
Type: sdiv1alpha1.ConditionTypeDegraded,
Status: metav1.ConditionTrue,
Expand All @@ -186,11 +185,16 @@ func (r *SDIObserverReconciler) handleError(ctx context.Context, cr *sdiv1alpha1
Message: msg,
})
if updateErr := r.Status().Update(ctx, cr); updateErr != nil {
return ctrl.Result{RequeueAfter: 1 * time.Minute}, utilerrors.NewAggregate([]error{err, updateErr})
// Return error to trigger exponential backoff when we can't even update status
logger.Error(utilerrors.NewAggregate([]error{err, updateErr}), "Failed to update status after error")
return ctrl.Result{}, utilerrors.NewAggregate([]error{err, updateErr})
}
logger := log.FromContext(ctx)
logger.Error(err, "Returning error with RequeueAfter", "RequeueAfter", 1*time.Minute)
return ctrl.Result{RequeueAfter: 1 * time.Minute}, err

// Log the error but return only Result (no error) to use fixed requeue interval
// instead of exponential backoff. This ensures we retry within a predictable time
// when components become available.
logger.Error(err, "Reconciliation failed, will retry", "retryAfter", r.Interval)
return ctrl.Result{RequeueAfter: r.Interval}, nil
}

// ConditionHolder is a helper struct to hold conditions
Expand Down
24 changes: 12 additions & 12 deletions observer-operator/controllers/sdiobserver_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ func TestSDIObserverReconciler_Reconcile_Success(t *testing.T) {
},
}

client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(obs).Build()
client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(obs).WithStatusSubresource(obs).Build()

reconciler := &SDIObserverReconciler{
Client: client,
Expand All @@ -90,10 +90,10 @@ func TestSDIObserverReconciler_Reconcile_Success(t *testing.T) {
ctx := context.Background()
result, err := reconciler.Reconcile(ctx, req)

// We expect an error here because the test doesn't set up all the required resources
// But we're testing that the reconciler doesn't panic and handles the error gracefully
if err == nil {
t.Log("No error occurred - this might indicate missing test setup or successful reconciliation")
// We don't expect an error because handleError now returns nil error
// to use fixed requeue interval instead of exponential backoff
if err != nil {
t.Logf("Unexpected error: %v", err)
}

// The result should include a requeue after interval
Expand Down Expand Up @@ -164,7 +164,7 @@ func TestSDIObserverReconciler_handleError(t *testing.T) {
},
}

client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(obs).Build()
client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(obs).WithStatusSubresource(obs).Build()

reconciler := &SDIObserverReconciler{
Client: client,
Expand All @@ -177,17 +177,17 @@ func TestSDIObserverReconciler_handleError(t *testing.T) {
testError := &TestError{message: "test error"}
result, err := reconciler.handleError(ctx, obs, testError, "Test error message")

// The handleError method may return an aggregate error including update errors
if err == nil {
t.Error("Expected an error to be returned")
// handleError returns nil error to use fixed requeue interval (avoids exponential backoff)
// This ensures predictable retry timing when components become available
if err != nil {
t.Errorf("Expected no error (to avoid exponential backoff), got %v", err)
}

if result.RequeueAfter != 1*time.Minute {
t.Error("Expected requeue after 1 minute")
t.Errorf("Expected requeue after 1 minute, got %v", result.RequeueAfter)
}

// Check that the status condition was set (may not persist due to fake client limitations)
// This is mainly testing that handleError doesn't panic and sets the appropriate requeue
// Check that the status condition was set
if len(obs.Status.Conditions) > 0 {
t.Log("Status conditions were set as expected")
}
Expand Down
40 changes: 38 additions & 2 deletions observer-operator/pkg/adjuster/adjuster.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ import (
"context"

"github.com/go-logr/logr"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"sigs.k8s.io/controller-runtime/pkg/client"
)

Expand Down Expand Up @@ -36,6 +38,9 @@ func New(name, namespace string, c client.Client, scheme *runtime.Scheme, logger
}

// Adjust performs a series of adjustments using the provided Actioner.
// It continues processing all adjustments even if some fail, collecting errors.
// NotFound errors are logged and skipped (component not yet available).
// Other errors are aggregated and returned at the end.
func (a *Adjuster) Adjust(ac Actioner, ctx context.Context) error {
// List of adjustment functions with their corresponding log messages
adjustments := []struct {
Expand All @@ -49,13 +54,44 @@ func (a *Adjuster) Adjust(ac Actioner, ctx context.Context) error {
{"SDI network", func() error { return ac.AdjustSDINetwork(a, ctx) }},
}

var errs []error
for _, adjustment := range adjustments {
if err := adjustment.action(); err != nil {
return err
if isNotFoundError(err) {
a.logger.Info("Component not found, skipping adjustment", "adjustment", adjustment.name, "error", err.Error())
continue
}
a.logger.Error(err, "Adjustment failed", "adjustment", adjustment.name)
errs = append(errs, err)
}
}

return nil
return utilerrors.NewAggregate(errs)
}

// isNotFoundError reports whether err is a NotFound error, or an aggregate
// whose members are all NotFound errors (nested aggregates are checked
// recursively). A nil error is never treated as NotFound.
func isNotFoundError(err error) bool {
	switch {
	case err == nil:
		return false
	case errors.IsNotFound(err):
		// A plain NotFound error.
		return true
	}

	// Anything that is not an aggregate at this point is some other failure.
	aggregate, isAggregate := err.(utilerrors.Aggregate)
	if !isAggregate {
		return false
	}

	// An aggregate qualifies only when every member is NotFound.
	members := aggregate.Errors()
	for _, member := range members {
		if !isNotFoundError(member) {
			return false
		}
	}
	return len(members) > 0
}

// Logger returns the logger instance associated with the Adjuster.
Expand Down
Loading
Loading