From f89909f60ccbc702d16422c6222547fb39b93b6a Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Mon, 12 Feb 2024 18:45:50 +0100 Subject: [PATCH] fix(dsrepl): tolerate trigger election timeouts for existing servers --- .../src/emqx_ds_replication_layer_shard.erl | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl index c5bbff8e0..dde96fbc2 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl @@ -153,8 +153,9 @@ start_shard(DB, Shard) -> Servers = shard_servers(DB, Shard), case ra:restart_server(System, LocalServer) of ok -> - ok; + Bootstrap = false; {error, name_not_registered} -> + Bootstrap = true, ok = ra:start_server(System, #{ id => LocalServer, uid => <>, @@ -172,7 +173,17 @@ start_shard(DB, Shard) -> %% is not really required otherwise. %% TODO %% Ensure that doing that on node restart does not disrupt consensus. - ok = ra:trigger_election(LocalServer); + %% Edit: looks like it doesn't, this could actually be quite useful + %% to "steal" leadership from nodes that have too much leader load. + try + ra:trigger_election(LocalServer, _Timeout = 1_000) + catch + %% TODO + %% Tolerating exceptions because server might be occupied with log + %% replay for a while. + exit:{timeout, _} when not Bootstrap -> + ok + end; _ -> ok end,