From 2fa4f4972e4af2ea8d1896a0fa4d7f82eb6b9940 Mon Sep 17 00:00:00 2001 From: Nial <48334675+nmcc1212@users.noreply.github.com> Date: Tue, 24 Feb 2026 16:58:42 +0000 Subject: [PATCH] monitor: fix report_lsn deadlock when quorum nodes are permanently unreachable Fixes #1113 --- src/monitor/group_state_machine.c | 68 ++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/src/monitor/group_state_machine.c b/src/monitor/group_state_machine.c index cf2b3e4e4..e1209bf8f 100644 --- a/src/monitor/group_state_machine.c +++ b/src/monitor/group_state_machine.c @@ -48,6 +48,7 @@ typedef struct CandidateList int candidateCount; int quorumCandidateCount; int missingNodesCount; + int deadMissingNodesCount; } CandidateList; @@ -1374,6 +1375,64 @@ ProceedGroupStateForMSFailover(AutoFailoverNode *activeNode, return false; } + /* + * Some replication-quorum nodes are unreachable (unhealthy and not + * reporting) and never reported their LSN. We cannot recover their + * LSN, but we can still proceed safely when the number of such dead + * nodes is strictly less than number_sync_standbys. + * + * Reasoning (pigeonhole principle): if number_sync_standbys = S, then + * every commit that was ever acknowledged to a client was received by at + * least S standbys. If fewer than S quorum nodes are missing, at least + * one of the alive candidates must have received the most recent commit, + * so promoting the most-advanced alive node cannot cause data loss beyond + * what has already been made unavoidable by the failures. + * + * When number_sync_standbys = 0 no synchronous-replication guarantee was + * in effect, so dead nodes never blocked data-safety and we always + * proceed. + */ + if (candidateList.deadMissingNodesCount > 0) + { + if (formation->number_sync_standbys > 0 && + candidateList.deadMissingNodesCount >= formation->number_sync_standbys) + { + char message[BUFSIZE] = { 0 }; + + LogAndNotifyMessage( + message, BUFSIZE, + "Failover still in progress after %d nodes reported their LSN " + "and %d unreachable nodes did not report; " + "manual intervention (pg_autoctl drop node) is required because " + "the number of unreachable quorum nodes (%d) is not less than " + "number_sync_standbys (%d), so data-safe automatic failover " + "cannot be guaranteed; " + "activeNode is " NODE_FORMAT + " and reported state \"%s\"", + candidateList.candidateCount, + candidateList.deadMissingNodesCount, + candidateList.deadMissingNodesCount, + formation->number_sync_standbys, + NODE_FORMAT_ARGS(activeNode), + ReplicationStateGetName(activeNode->reportedState)); + + return false; + } + + /* + * deadMissingNodesCount < number_sync_standbys (or + * number_sync_standbys == 0): safe to proceed. + */ + elog(LOG, + "Proceeding with failover despite %d unreachable quorum node(s) " + "not having reported their LSN: " + "deadMissingNodesCount (%d) < number_sync_standbys (%d), " + "so at least one alive candidate holds the most recent LSN", + candidateList.deadMissingNodesCount, + candidateList.deadMissingNodesCount, + formation->number_sync_standbys); + } + /* * So all the expected candidates did report their LSN, no node is missing. * Let's see about selecting a candidate for failover now, when we do have @@ -1569,11 +1628,18 @@ BuildCandidateList(List *nodesGroupList, CandidateList *candidateList) * Only the nodes that participate in the quorum are required to * report their LSN, because only those nodes are waited by * Postgres to report a commit to the client connection. + * + * Count these unreachable quorum nodes separately from + * missingNodesCount (which tracks nodes that are alive but have + * not reported yet). This allows the caller to decide whether it + * is safe to proceed using the pigeonhole principle: if + * deadMissingNodesCount < number_sync_standbys, then at least one + * alive candidate is guaranteed to hold the most recent LSN. */ if (node->replicationQuorum && node->reportedState != REPLICATION_STATE_REPORT_LSN) { - ++(candidateList->missingNodesCount); + ++(candidateList->deadMissingNodesCount); } continue;