fix(dsrepl): trigger election for new ra servers unconditionally

Otherwise we might end up in a situation where no member is online yet
at the time the election is triggered, and the election will never
happen.
This commit is contained in:
Andrew Mayorov 2024-04-15 16:42:29 +02:00
parent 34be2ea9a0
commit c4d1360b96
No known key found for this signature in database
GPG Key ID: 2837C62ACFBFED5D
2 changed files with 17 additions and 25 deletions

View File

@ -341,29 +341,21 @@ start_shard(DB, Shard, #{replication_options := ReplicationOpts}) ->
log_init_args => LogOpts log_init_args => LogOpts
}) })
end, end,
case Servers of %% NOTE
[LocalServer | _] -> %% Triggering election is necessary when a new consensus group is being brought up.
%% TODO %% TODO
%% Not super robust, but we probably don't expect nodes to be down %% It's probably a good idea to rebalance leaders across the cluster from time to
%% when we bring up a fresh consensus group. Triggering election %% time. There's `ra:transfer_leadership/2` for that.
%% is not really required otherwise. try Bootstrap andalso ra:trigger_election(LocalServer, _Timeout = 1_000) of
%% TODO false ->
%% Ensure that doing that on node restart does not disrupt consensus. ok;
%% Edit: looks like it doesn't, this could actually be quite useful ok ->
%% to "steal" leadership from nodes that have too much leader load. ok
%% TODO catch
%% It doesn't really work that way. There's `ra:transfer_leadership/2` %% TODO
%% for that. %% Tolerating exceptions because server might be occupied with log replay for
try %% a while.
ra:trigger_election(LocalServer, _Timeout = 1_000) exit:{timeout, _} when not Bootstrap ->
catch
%% TODO
%% Tolerating exceptions because server might be occupied with log
%% replay for a while.
exit:{timeout, _} when not Bootstrap ->
ok
end;
_ ->
ok ok
end. end.

View File

@ -435,8 +435,8 @@ t_rebalance_offline_restarts(Config) ->
erpc:multicall(Nodes, emqx_ds, open_db, [?DB, Opts]) erpc:multicall(Nodes, emqx_ds, open_db, [?DB, Opts])
), ),
?retry( ?retry(
500, 1000,
10, 5,
?assertEqual([8 || _ <- Nodes], [n_shards_online(N, ?DB) || N <- Nodes]) ?assertEqual([8 || _ <- Nodes], [n_shards_online(N, ?DB) || N <- Nodes])
), ),