fix(dsrepl): tolerate trigger election timeouts for existing servers

2024-02-12 18:45:50 +01:00 · 2024-02-12 18:45:50 +01:00 · f89909f60c
parent 6c6ea50e42
commit f89909f60c
1 changed files with 13 additions and 2 deletions
--- a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl
+++ b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl
@ -153,8 +153,9 @@ start_shard(DB, Shard) ->
    Servers = shard_servers(DB, Shard),
    case ra:restart_server(System, LocalServer) of
        ok ->
-            ok;
+            Bootstrap = false;
        {error, name_not_registered} ->
+            Bootstrap = true,
            ok = ra:start_server(System, #{
                id => LocalServer,
                uid => <<ClusterName/binary, "_", Site/binary>>,
@ -172,7 +173,17 @@ start_shard(DB, Shard) ->
            %% is not really required otherwise.
            %% TODO
            %% Ensure that doing that on node restart does not disrupt consensus.
-            ok = ra:trigger_election(LocalServer);
+            %% Edit: looks like it doesn't, this could actually be quite useful
+            %% to "steal" leadership from nodes that have too much leader load.
+            try
+                ra:trigger_election(LocalServer, _Timeout = 1_000)
+            catch
+                %% TODO
+                %% Tolerating exceptions because server might be occupied with log
+                %% replay for a while.
+                exit:{timeout, _} when not Bootstrap ->
+                    ok
+            end;
        _ ->
            ok
    end,