From c4d1360b96b18925151c45b61691ab8c5608b340 Mon Sep 17 00:00:00 2001
From: Andrew Mayorov <encube.ul@gmail.com>
Date: Mon, 15 Apr 2024 16:42:29 +0200
Subject: [PATCH] fix(dsrepl): trigger election for new ra servers
 unconditionallly

Otherwise we might end up in a situation when there's no member online
yet at the time of the election trigger, and the election will never
happen.
---
 .../src/emqx_ds_replication_layer_shard.erl   | 38 ++++++++-----------
 .../test/emqx_ds_replication_SUITE.erl        |  4 +-
 2 files changed, 17 insertions(+), 25 deletions(-)

diff --git a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl
index f4c0d3b01..20d9ef481 100644
--- a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl
+++ b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl
@@ -341,29 +341,21 @@ start_shard(DB, Shard, #{replication_options := ReplicationOpts}) ->
                 log_init_args => LogOpts
             })
     end,
-    case Servers of
-        [LocalServer | _] ->
-            %% TODO
-            %% Not super robust, but we probably don't expect nodes to be down
-            %% when we bring up a fresh consensus group. Triggering election
-            %% is not really required otherwise.
-            %% TODO
-            %% Ensure that doing that on node restart does not disrupt consensus.
-            %% Edit: looks like it doesn't, this could actually be quite useful
-            %% to "steal" leadership from nodes that have too much leader load.
-            %% TODO
-            %% It doesn't really work that way. There's `ra:transfer_leadership/2`
-            %% for that.
-            try
-                ra:trigger_election(LocalServer, _Timeout = 1_000)
-            catch
-                %% TODO
-                %% Tolerating exceptions because server might be occupied with log
-                %% replay for a while.
-                exit:{timeout, _} when not Bootstrap ->
-                    ok
-            end;
-        _ ->
+    %% NOTE
+    %% Triggering election is necessary when a new consensus group is being brought up.
+    %% TODO
+    %% It's probably a good idea to rebalance leaders across the cluster from time to
+    %% time. There's `ra:transfer_leadership/2` for that.
+    try Bootstrap andalso ra:trigger_election(LocalServer, _Timeout = 1_000) of
+        false ->
+            ok;
+        ok ->
+            ok
+    catch
+        %% TODO
+        %% Tolerating exceptions because server might be occupied with log replay for
+        %% a while.
+        exit:{timeout, _} when not Bootstrap ->
             ok
     end.
 
diff --git a/apps/emqx_durable_storage/test/emqx_ds_replication_SUITE.erl b/apps/emqx_durable_storage/test/emqx_ds_replication_SUITE.erl
index 9fc55d170..3b0e37c7f 100644
--- a/apps/emqx_durable_storage/test/emqx_ds_replication_SUITE.erl
+++ b/apps/emqx_durable_storage/test/emqx_ds_replication_SUITE.erl
@@ -435,8 +435,8 @@ t_rebalance_offline_restarts(Config) ->
         erpc:multicall(Nodes, emqx_ds, open_db, [?DB, Opts])
     ),
     ?retry(
-        500,
-        10,
+        1000,
+        5,
         ?assertEqual([8 || _ <- Nodes], [n_shards_online(N, ?DB) || N <- Nodes])
     ),