From dc942e11333cf86ff64277a742e3ef5d57bb01f7 Mon Sep 17 00:00:00 2001 From: Austin Wheeler Date: Mon, 2 Mar 2026 17:36:43 -0800 Subject: [PATCH 1/2] fix: add node health check to incr/decr before mutate operation incr/decr operations bypassed the ensureWriteQueueSize check that other write paths (set, add, touch) use, causing them to block on unreachable or stalled nodes for the full mutateOperationTimeout duration. A single unhealthy node could cause all incr/decr calls to block, exhausting the caller's thread pool. This adds the same ensureWriteQueueSize gate to incr/decr. If the target node is inactive or its write queue is full, the operation returns -1 immediately, consistent with the documented API contract. The existing reconciliation logic in EVCacheImpl.incr()/decr() already handles -1 as a zone failure and will sync the value from healthy zones on the next successful operation. --- .../main/java/com/netflix/evcache/pool/EVCacheClient.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/evcache-core/src/main/java/com/netflix/evcache/pool/EVCacheClient.java b/evcache-core/src/main/java/com/netflix/evcache/pool/EVCacheClient.java index 3d54e497..1203327b 100644 --- a/evcache-core/src/main/java/com/netflix/evcache/pool/EVCacheClient.java +++ b/evcache-core/src/main/java/com/netflix/evcache/pool/EVCacheClient.java @@ -847,10 +847,18 @@ public Map getAllChunks(String key) throws EVCacheReadQueueE } public long incr(String key, long by, long defaultVal, int timeToLive) throws EVCacheException { + final MemcachedNode node = evcacheMemcachedClient.getEVCacheNode(key); + if (!ensureWriteQueueSize(node, key, Call.INCR)) { + return -1; + } return evcacheMemcachedClient.incr(key, by, defaultVal, timeToLive); } public long decr(String key, long by, long defaultVal, int timeToLive) throws EVCacheException { + final MemcachedNode node = evcacheMemcachedClient.getEVCacheNode(key); + if (!ensureWriteQueueSize(node, key, Call.DECR)) { + return -1; + } return evcacheMemcachedClient.decr(key, by, defaultVal, timeToLive); } From 75adcd203ccaa0d583b6bf44adfaab27776f6120 Mon Sep 17 00:00:00 2001 From: Austin Wheeler Date: Fri, 6 Mar 2026 15:49:17 -0800 Subject: [PATCH 2/2] fix: decr reconciliation picks min value; fast-fail inactive nodes in write path - EVCacheImpl.decr: fix reconciliation to pick the minimum non-(-1) value across nodes instead of the maximum. For decr, the most up-to-date node has the lowest value (most decremented), so the old max-pick logic would overwrite correctly decremented nodes with stale higher values. - EVCacheClient.ensureWriteQueueSize: add isAvailable() check before entering the retry/sleep loop. This fast-fails writes to inactive nodes instead of blocking request threads through 3 retry iterations. Affects all write operations that go through ensureWriteQueueSize (incr, decr, set, delete, etc.). --- .../src/main/java/com/netflix/evcache/EVCacheImpl.java | 2 +- .../main/java/com/netflix/evcache/pool/EVCacheClient.java | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/evcache-core/src/main/java/com/netflix/evcache/EVCacheImpl.java b/evcache-core/src/main/java/com/netflix/evcache/EVCacheImpl.java index 711ea90f..6817dd09 100644 --- a/evcache-core/src/main/java/com/netflix/evcache/EVCacheImpl.java +++ b/evcache-core/src/main/java/com/netflix/evcache/EVCacheImpl.java @@ -2961,7 +2961,7 @@ public long decr(String key, long by, long defaultVal, int timeToLive) throws EV int index = 0; for (EVCacheClient client : clients) { vals[index] = client.decr(evcKey.getDerivedKey(client.isDuetClient(), client.getHashingAlgorithm(), client.shouldEncodeHashKey(), client.getMaxDigestBytes(), client.getMaxHashLength(), client.getBaseEncoder()), by, defaultVal, timeToLive); - if (vals[index] != -1 && currentValue < vals[index]) { + if (vals[index] != -1 && (currentValue == -1 || vals[index] < currentValue)) { currentValue = vals[index]; if (log.isDebugEnabled()) log.debug("DECR : APP " + _appName + " current value = " + currentValue + " for key : " + key + " from client : " + client); } diff --git a/evcache-core/src/main/java/com/netflix/evcache/pool/EVCacheClient.java b/evcache-core/src/main/java/com/netflix/evcache/pool/EVCacheClient.java index 1203327b..6f0fbd74 100644 --- a/evcache-core/src/main/java/com/netflix/evcache/pool/EVCacheClient.java +++ b/evcache-core/src/main/java/com/netflix/evcache/pool/EVCacheClient.java @@ -270,6 +270,12 @@ public void reportWrongKeyReturned() { private boolean ensureWriteQueueSize(MemcachedNode node, String key, EVCache.Call call) throws EVCacheException { if (node instanceof EVCacheNode) { final EVCacheNode evcNode = (EVCacheNode) node; + if (!evcNode.isAvailable(call)) { + incrementFailure(EVCacheMetricsFactory.INACTIVE_NODE, call); + if (log.isDebugEnabled()) log.debug("Inactive Node " + evcNode + " on " + call + " operation for app : " + appName + + "; zone : " + zone + "; key : " + key); + return false; + } int i = 0; while (true) { final int size = evcNode.getWriteQueueSize();