From 1b86c58a05f056c4a03db13a85daf5cdea327a63 Mon Sep 17 00:00:00 2001
From: Heikki Nousiainen
Date: Mon, 4 Nov 2024 12:16:32 +0200
Subject: [PATCH] pglookout: support explicit failover priorities

Support explicit prioritization between instances. This can be
configured via the ``failover_priorities`` key, which is consulted when
picking the standby that should perform the promotion in cases where
multiple nodes have a matching replication position.

Previously, and also as the current default, the selection was based on
the sorting order of the remote nodes. The configuration option adds
flexibility and supports e.g. topologies where more preferred and less
preferred standbys live in different network locations.
---
 README.rst             |  8 +++++++
 pglookout/pglookout.py | 23 +++++++++++-------
 test/test_lookout.py   | 53 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/README.rst b/README.rst
index fb9cd39..57322a7 100644
--- a/README.rst
+++ b/README.rst
@@ -295,6 +295,14 @@ over_warning_limit_command and to create a warning file.
 
 Shell command to execute in case the node has deemed itself in need of promotion
 
+``failover_priorities`` (default ``{}``)
+
+Defines the priority of nodes for promotion when multiple candidates have the
+same replication position, making it possible for all pglookout instances to
+elect the same standby, e.g. in topologies with less preferred standbys in
+secondary network locations. Nodes without an explicit priority default to 0;
+remaining ties are resolved by the remote connection ids (the default behaviour).
+
 ``known_gone_nodes`` (default ``[]``)
 
 Lists nodes that are explicitly known to have left the cluster. If the old
diff --git a/pglookout/pglookout.py b/pglookout/pglookout.py
index 42fbf3a..a625cbd 100755
--- a/pglookout/pglookout.py
+++ b/pglookout/pglookout.py
@@ -643,14 +643,21 @@ def do_failover_decision(self, standby_nodes):
         if not known_replication_positions:
             self.log.warning("No known replication positions, canceling failover consideration")
             return
-        # If there are multiple nodes with the same replication positions pick the one with the "highest" name
-        # to make sure pglookouts running on all standbys make the same decision. The rationale for picking
-        # the "highest" node is that there's no obvious way for pglookout to decide which of the nodes is
-        # "best" beyond looking at replication positions, but picking the highest id supports environments
-        # where nodes are assigned identifiers from an incrementing sequence identifiers and where we want to
-        # promote the latest and greatest node. In static environments node identifiers can be priority
-        # numbers, with the highest number being the one that should be preferred.
-        furthest_along_instance = max(known_replication_positions[max(known_replication_positions)])
+
+        # Find the instance that is furthest along.
+        # If there are multiple nodes with the same replication position, pick the one to promote
+        # either via the explicit failover priority configuration or, by default, the "highest" name.
+        # The rationale is to ensure that the pglookout instances running on all standbys make the
+        # same decision. The "highest" name works well in environments where nodes are assigned
+        # identifiers from an incrementing sequence and we want to promote the latest node.
+        def _priority_and_id(instance):
+            # Unconfigured nodes default to priority 0 and equal priorities are resolved by the
+            # instance id; a tuple key avoids comparing ints against strs, which would raise TypeError.
+            priority = self.config.get("failover_priorities", {}).get(instance, 0)
+            return priority, instance
+
+        furthest_along_instances = known_replication_positions[max(known_replication_positions)]
+        furthest_along_instance = max(furthest_along_instances, key=_priority_and_id)
         self.log.warning(
             "Node that is furthest along is: %r, all replication positions were: %r",
             furthest_along_instance,
diff --git a/test/test_lookout.py b/test/test_lookout.py
index 1720c8e..c8d6a78 100644
--- a/test/test_lookout.py
+++ b/test/test_lookout.py
@@ -1005,6 +1005,59 @@ def test_standbys_failover_equal_replication_positions(pgl):
     assert pgl.execute_external_command.call_count == 1
 
 
+def test_standbys_failover_equal_replication_positions_with_priorities(pgl):
+    now = datetime.datetime.utcnow()
+    _set_instance_cluster_state(
+        pgl,
+        instance="192.168.54.183",
+        pg_last_xlog_receive_location="0/70004D8",
+        pg_is_in_recovery=True,
+        connection=True,
+        replication_time_lag=400.435871,
+        fetch_time=now,
+        db_time=now,
+        conn_info="foobar",
+    )
+    _set_instance_cluster_state(
+        pgl,
+        instance="192.168.57.180",
+        pg_last_xlog_receive_location=None,
+        pg_is_in_recovery=False,
+        connection=False,
+        replication_time_lag=0.0,
+        fetch_time=now - datetime.timedelta(seconds=3600),
+        db_time=now - datetime.timedelta(seconds=3600),
+        conn_info="foobar",
+    )
+    _set_instance_cluster_state(
+        pgl,
+        instance="192.168.63.4",
+        pg_last_xlog_receive_location="0/70004D8",
+        pg_is_in_recovery=True,
+        connection=True,
+        replication_time_lag=401.104655,
+        fetch_time=now,
+        db_time=now,
+        conn_info="foobar",
+    )
+
+    pgl.current_master = "192.168.57.180"
+
+    pgl.config["failover_priorities"] = {
+        "192.168.54.183": 1000,
+        "192.168.63.4": 0,
+    }
+
+    # Highest by instance id, but lower in priority: must not promote itself
+    pgl.own_db = "192.168.63.4"
+    pgl.check_cluster_state()
+    assert pgl.execute_external_command.call_count == 0
+    # Lower by instance id, but higher in priority: must promote itself
+    pgl.own_db = "192.168.54.183"
+    pgl.check_cluster_state()
+    assert pgl.execute_external_command.call_count == 1
+
+
 def test_node_map_when_only_observer_sees_master(pgl):
     cluster_state = {
         "10.255.255.10": {
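
For illustration, a minimal configuration sketch exercising the new key. The
host addresses and the promotion script path are hypothetical, while
``remote_conns`` and ``failover_command`` are pre-existing pglookout settings;
``failover_priorities`` is keyed by the same instance ids as ``remote_conns``:

    {
        "remote_conns": {
            "192.168.54.183": "host=192.168.54.183 dbname=postgres user=pglookout",
            "192.168.63.4": "host=192.168.63.4 dbname=postgres user=pglookout"
        },
        "failover_command": "/usr/local/bin/promote.sh",
        "failover_priorities": {
            "192.168.54.183": 1000,
            "192.168.63.4": 0
        }
    }

With equal replication positions, every pglookout instance would pick
192.168.54.183 for promotion even though 192.168.63.4 sorts higher by
connection id; with ``failover_priorities`` left empty, both nodes default to
priority 0 and the connection id tie-break applies as before.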