From 6ddc72c760634482f9586e94e51896fc21bdf889 Mon Sep 17 00:00:00 2001 From: Amit Balode Date: Mon, 9 Mar 2026 06:13:32 +0530 Subject: [PATCH 1/3] HDFS-17840. Fix incorrect Nodes-in-Service count in NameNode UI StorageType stats. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The StorageType stats map maintained a nodesInService counter using increments/decrements (via StorageTypeStats.addNode / subtractNode). When nodesInService dropped to 0, the entry for that storage type was removed from the map — even when decommissioning nodes still used the storage type and still contributed capacity data. When the entry was later recreated by an addStorage call, it started fresh with nodesInService = 0. Subsequent in-service node heartbeats then performed subtract (no-op, entry was gone) followed by add (creates entry, nodesInService = 1), which was correct. But any in-service node whose subtract ran against the freshly-created entry saw nodesInService decrement past 0 to -1, and then add brought it back to 0 — so that node's in-service contribution was lost for the rest of the session. Fix: add a totalNodes counter to StorageTypeStats that tracks ALL nodes using a storage type (in-service + decommissioning + maintenance). Change the map-entry removal condition from nodesInService == 0 to totalNodes == 0. An entry is now removed only when no node of any admin state still uses that storage type, preventing the premature removal that caused the count corruption. Added TestStorageTypeStatsMap with 4 unit tests covering: - Basic add/remove correctness - Entry survival when a decommissioning node still uses the storage type - nodesInService stability after the last in-service node decommissions - Entry removal only when all nodes (including decommissioning) are gone Co-Authored-By: Claude Sonnet 4.6 --- .../server/blockmanagement/DatanodeStats.java | 2 +- .../blockmanagement/StorageTypeStats.java | 8 + .../TestStorageTypeStatsMap.java | 197 ++++++++++++++++++ 3 files changed, 206 insertions(+), 1 deletion(-) create mode 100644 hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestStorageTypeStatsMap.java diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeStats.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeStats.java index 5bd88b561ae91..70759a924ba89 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeStats.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeStats.java @@ -221,7 +221,7 @@ private void subtractNode(StorageType storageType, StorageTypeStats storageTypeStats = storageTypeStatsMap.get(storageType); if (storageTypeStats != null) { storageTypeStats.subtractNode(node); - if (storageTypeStats.getNodesInService() == 0) { + if (storageTypeStats.getTotalNodes() == 0) { storageTypeStatsMap.remove(storageType); } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageTypeStats.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageTypeStats.java index 83f18bf7804de..028035194712c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageTypeStats.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageTypeStats.java @@ -39,6 +39,7 @@ public class StorageTypeStats { private long capacityRemaining = 0L; private long blockPoolUsed = 0L; private int nodesInService = 0; + private int totalNodes = 0; private StorageType storageType; @VisibleForTesting @@ -130,6 +131,10 @@ public int getNodesInService() { return nodesInService; } + public int getTotalNodes() { + return totalNodes; + } + public int getNodesInServiceXceiverCount() { return nodesInServiceXceiverCount; } @@ -145,6 +150,7 @@ public int getNodesInServiceXceiverCount() { capacityRemaining = other.capacityRemaining; blockPoolUsed = other.blockPoolUsed; nodesInService = other.nodesInService; + totalNodes = other.totalNodes; } void addStorage(final DatanodeStorageInfo info, @@ -162,6 +168,7 @@ void addStorage(final DatanodeStorageInfo info, } void addNode(final DatanodeDescriptor node) { + totalNodes++; if (node.isInService()) { nodesInService++; nodesInServiceXceiverCount += node.getXceiverCount(); @@ -183,6 +190,7 @@ void subtractStorage(final DatanodeStorageInfo info, } void subtractNode(final DatanodeDescriptor node) { + totalNodes--; if (node.isInService()) { nodesInService--; nodesInServiceXceiverCount -= node.getXceiverCount(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestStorageTypeStatsMap.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestStorageTypeStatsMap.java new file mode 100644 index 0000000000000..91e5ed605a91e --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestStorageTypeStatsMap.java @@ -0,0 +1,197 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.blockmanagement; + +import java.util.Map; + +import org.apache.hadoop.fs.StorageType; +import org.apache.hadoop.hdfs.DFSTestUtil; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; + +/** + * Unit tests for the nodesInService count in StorageTypeStats / DatanodeStats. + * + * Regression test for HDFS-17840: StorageTypeStats.nodesInService was + * incorrectly reset to zero when the storage-type map entry was prematurely + * removed. The entry was removed whenever nodesInService reached 0, even + * when non-in-service (e.g. decommissioning) nodes still used the storage + * type. Subsequent in-service heartbeats then started from a fresh entry, + * under-counting the real number of in-service nodes. + * + * The fix tracks totalNodes (all nodes, regardless of admin state) and only + * removes the entry when totalNodes == 0. + */ +public class TestStorageTypeStatsMap { + + private DatanodeStats stats; + + @BeforeEach + public void setUp() { + stats = new DatanodeStats(); + } + + /** + * Creates a DatanodeStorageInfo with the given StorageType and returns its + * owning DatanodeDescriptor (which starts in the NORMAL / in-service state). + */ + private static DatanodeDescriptor makeNode(String id, StorageType type) { + DatanodeStorageInfo si = DFSTestUtil.createDatanodeStorageInfo( + id, "127.0.0." + id.hashCode() % 200, "/rack", "host-" + id, type, null); + return si.getDatanodeDescriptor(); + } + + /** Return the StorageTypeStats for the given type, or null if absent. */ + private StorageTypeStats getStats(StorageType type) { + Map m = stats.getStatsMap(); + return m.get(type); + } + + /** + * Basic smoke test: two in-service nodes, both using DISK. + * After adding both, nodesInService should be 2. + * After removing one, it should be 1. + */ + @Test + public void testBasicAddRemove() { + DatanodeDescriptor a = makeNode("a", StorageType.DISK); + DatanodeDescriptor b = makeNode("b", StorageType.DISK); + + stats.add(a); + stats.add(b); + assertNotNull(getStats(StorageType.DISK)); + assertEquals(2, getStats(StorageType.DISK).getNodesInService()); + + stats.subtract(b); + assertNotNull(getStats(StorageType.DISK)); + assertEquals(1, getStats(StorageType.DISK).getNodesInService()); + + stats.subtract(a); + assertNull(getStats(StorageType.DISK), + "Entry should be removed when no nodes remain"); + } + + /** + * HDFS-17840 regression: when one of two in-service nodes transitions to + * decommission-in-progress, the entry must NOT be removed, and + * nodesInService must remain correct for the still-in-service node. + */ + @Test + public void testEntryNotRemovedWhenDecommissioningNodeRemains() { + DatanodeDescriptor a = makeNode("aa", StorageType.DISK); + DatanodeDescriptor b = makeNode("bb", StorageType.DISK); + + stats.add(a); + stats.add(b); + assertEquals(2, getStats(StorageType.DISK).getNodesInService()); + + // Simulate A transitioning to decommission-in-progress (as HeartbeatManager + // does in startDecommission): subtract, change state, add. + stats.subtract(a); + a.startDecommission(); // adminState -> DECOMMISSION_INPROGRESS + stats.add(a); + + // The DISK entry must still exist and B must still be counted. + assertNotNull(getStats(StorageType.DISK), + "DISK entry must survive when a decommissioning node still uses it"); + assertEquals(1, getStats(StorageType.DISK).getNodesInService(), + "Only the in-service node B should be counted"); + + // Now B heartbeats (subtract + add with unchanged state). + stats.subtract(b); + stats.add(b); + assertEquals(1, getStats(StorageType.DISK).getNodesInService(), + "nodesInService must remain 1 after B's heartbeat"); + } + + /** + * HDFS-17840 regression: when the last in-service node transitions to + * decommissioning (nodesInService drops to 0), the entry must still NOT be + * removed as long as that decommissioning node is alive. A subsequent + * in-service node's heartbeat must then see the correct count (1, not 0). + */ + @Test + public void testEntryNotRemovedWhenLastInServiceDecommissions() { + DatanodeDescriptor a = makeNode("a1", StorageType.DISK); + DatanodeDescriptor b = makeNode("b1", StorageType.DISK); + + stats.add(a); + stats.add(b); + assertEquals(2, getStats(StorageType.DISK).getNodesInService()); + + // Decommission A. + stats.subtract(a); + a.startDecommission(); + stats.add(a); + assertEquals(1, getStats(StorageType.DISK).getNodesInService()); + + // Decommission B (last in-service node). + stats.subtract(b); + b.startDecommission(); + stats.add(b); + + // nodesInService == 0, but A and B are still alive and using DISK. + assertNotNull(getStats(StorageType.DISK), + "DISK entry must survive when decommissioning nodes still use it"); + assertEquals(0, getStats(StorageType.DISK).getNodesInService()); + + // Now a new node C registers (in-service). + DatanodeDescriptor c = makeNode("c1", StorageType.DISK); + stats.add(c); + assertEquals(1, getStats(StorageType.DISK).getNodesInService(), + "New in-service node must be counted correctly"); + + // C heartbeats. + stats.subtract(c); + stats.add(c); + assertEquals(1, getStats(StorageType.DISK).getNodesInService(), + "nodesInService must stay 1 after C's heartbeat"); + } + + /** + * Entry should be removed only after all nodes (in-service and + * decommissioning) are fully removed via removeDatanode (subtract only). + */ + @Test + public void testEntryRemovedOnlyWhenAllNodesGone() { + DatanodeDescriptor a = makeNode("x1", StorageType.DISK); + DatanodeDescriptor b = makeNode("x2", StorageType.DISK); + + stats.add(a); + stats.add(b); + + // Decommission A. + stats.subtract(a); + a.startDecommission(); + stats.add(a); + + // Remove B (simulating removeDatanode). + stats.subtract(b); + assertNotNull(getStats(StorageType.DISK), + "DISK entry must remain because A is still using it"); + + // Remove A too. + stats.subtract(a); + assertNull(getStats(StorageType.DISK), + "DISK entry must be removed once no nodes use it"); + } +} From 2f56b9c1ed6136151b4d8d6c2b11c5c03f2baaab Mon Sep 17 00:00:00 2001 From: Amit Balode Date: Thu, 19 Mar 2026 17:05:16 +0530 Subject: [PATCH 2/3] retrigger CI From e5f806db08071b384a2ccebb2808d049c23bcd56 Mon Sep 17 00:00:00 2001 From: Amit Balode Date: Fri, 20 Mar 2026 17:13:50 +0530 Subject: [PATCH 3/3] retrigger CI