From c4d1360b96b18925151c45b61691ab8c5608b340 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Mon, 15 Apr 2024 16:42:29 +0200 Subject: [PATCH] fix(dsrepl): trigger election for new ra servers unconditionallly Otherwise we might end up in a situation when there's no member online yet at the time of the election trigger, and the election will never happen. --- .../src/emqx_ds_replication_layer_shard.erl | 38 ++++++++----------- .../test/emqx_ds_replication_SUITE.erl | 4 +- 2 files changed, 17 insertions(+), 25 deletions(-) diff --git a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl index f4c0d3b01..20d9ef481 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl @@ -341,29 +341,21 @@ start_shard(DB, Shard, #{replication_options := ReplicationOpts}) -> log_init_args => LogOpts }) end, - case Servers of - [LocalServer | _] -> - %% TODO - %% Not super robust, but we probably don't expect nodes to be down - %% when we bring up a fresh consensus group. Triggering election - %% is not really required otherwise. - %% TODO - %% Ensure that doing that on node restart does not disrupt consensus. - %% Edit: looks like it doesn't, this could actually be quite useful - %% to "steal" leadership from nodes that have too much leader load. - %% TODO - %% It doesn't really work that way. There's `ra:transfer_leadership/2` - %% for that. - try - ra:trigger_election(LocalServer, _Timeout = 1_000) - catch - %% TODO - %% Tolerating exceptions because server might be occupied with log - %% replay for a while. - exit:{timeout, _} when not Bootstrap -> - ok - end; - _ -> + %% NOTE + %% Triggering election is necessary when a new consensus group is being brought up. + %% TODO + %% It's probably a good idea to rebalance leaders across the cluster from time to + %% time. There's `ra:transfer_leadership/2` for that. + try Bootstrap andalso ra:trigger_election(LocalServer, _Timeout = 1_000) of + false -> + ok; + ok -> + ok + catch + %% TODO + %% Tolerating exceptions because server might be occupied with log replay for + %% a while. + exit:{timeout, _} when not Bootstrap -> ok end. diff --git a/apps/emqx_durable_storage/test/emqx_ds_replication_SUITE.erl b/apps/emqx_durable_storage/test/emqx_ds_replication_SUITE.erl index 9fc55d170..3b0e37c7f 100644 --- a/apps/emqx_durable_storage/test/emqx_ds_replication_SUITE.erl +++ b/apps/emqx_durable_storage/test/emqx_ds_replication_SUITE.erl @@ -435,8 +435,8 @@ t_rebalance_offline_restarts(Config) -> erpc:multicall(Nodes, emqx_ds, open_db, [?DB, Opts]) ), ?retry( - 500, - 10, + 1000, + 5, ?assertEqual([8 || _ <- Nodes], [n_shards_online(N, ?DB) || N <- Nodes]) ),