From 963b0c8a6ca15747681a507653b9a55bceb55189 Mon Sep 17 00:00:00 2001 From: Kc Balusu Date: Sun, 7 Jun 2026 12:16:12 -0700 Subject: [PATCH] ring: treat CAS no-change as success during instance registration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the KV store returns "no change detected" during instance registration, the ring entry already matches what the lifecycler tried to write. This is benign — but the current code treats it as a fatal error, which causes a permanent CrashLoopBackOff when the stored timestamp is ahead of the current time (e.g., due to clock corruption). In this scenario, the merge function never sees forward progress because time.Now() < stored_timestamp, so the CAS retries are exhausted and the error surfaces as: register instance in the ring: failed to CAS-update key ...: no change detected The ruler (or any other ring member) then fails to start on every restart. Handle this gracefully: if the CAS error contains "no change detected" during registration, log a warning and proceed. The instance is already registered with the desired state; there is nothing to fix. Fixes grafana/loki#21733 --- ring/basic_lifecycler.go | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/ring/basic_lifecycler.go b/ring/basic_lifecycler.go index 0a6cda30d..d90b0c1ee 100644 --- a/ring/basic_lifecycler.go +++ b/ring/basic_lifecycler.go @@ -5,6 +5,7 @@ import ( "fmt" "net/http" "sort" + "strings" "sync" "time" @@ -353,7 +354,17 @@ func (l *BasicLifecycler) registerInstance(ctx context.Context) error { }) if err != nil { - return err + // If the KV store detected no change (the instance's ring entry already matches + // what we tried to write), the registration is effectively successful. This can + // happen when the stored timestamp is ahead of the current time (e.g., clock + // corruption) — the merge function sees no forward progress and returns + // "no change detected." 
Treating this as fatal causes a permanent CrashLoopBackOff + // because the same no-change result recurs on every restart (issue grafana/loki#21733). + if strings.Contains(err.Error(), "no change detected") { + level.Warn(l.logger).Log("msg", "CAS detected no change during registration; instance already registered with identical state", "ring", l.ringName, "instance", l.cfg.ID, "err", err) + } else { + return err + } } l.currState.Lock()