fix(dsrepl): attempt leadership transfer before server removal

This should make it much less likely to hit weird edge cases that lead
to duplicate Raft log entries because of client retries upon receiving
`shutdown` from the leader being removed.
This commit is contained in:
Andrew Mayorov 2024-04-08 21:28:20 +02:00
parent 1e95bd4da6
commit 3223797ae5
No known key found for this signature in database
GPG Key ID: 2837C62ACFBFED5D
1 changed files with 43 additions and 0 deletions

View File

@ -114,6 +114,13 @@ get_server_local_preferred(DB, Shard) ->
pick_random(get_shard_servers(DB, Shard))
end.
lookup_leader(DB, Shard) ->
%% NOTE
%% Does not block, but the result may be outdated or even unknown when there's
%% no servers on the local node.
ClusterName = get_cluster_name(DB, Shard),
ra_leaderboard:lookup_leader(ClusterName).
pick_local(Servers) ->
case lists:keyfind(node(), 2, Servers) of
Local when is_tuple(Local) ->
@ -181,7 +188,22 @@ add_local_server(DB, Shard) ->
-spec drop_local_server(emqx_ds:db(), emqx_ds_replication_layer:shard_id()) ->
ok | emqx_ds:error(_Reason).
drop_local_server(DB, Shard) ->
ShardServers = shard_servers(DB, Shard),
LocalServer = local_server(DB, Shard),
case lookup_leader(DB, Shard) of
LocalServer ->
%% NOTE
%% Trigger leadership transfer *and* force to wait until the new leader
%% is elected and updated in the leaderboard. This should help to avoid
%% edge cases where entries appended right before removal are duplicated
%% due to client retries.
%% Timeouts are ignored, it's a best effort attempt.
[Candidate | _] = lists:delete(LocalServer, ShardServers),
_ = ra:transfer_leadership(LocalServer, Candidate),
_ = wait_until(fun() -> lookup_leader(DB, Shard) == Candidate end);
_Another ->
ok
end,
case remove_server(DB, Shard, LocalServer) of
ok ->
ra:force_delete_server(DB, LocalServer);
@ -351,3 +373,24 @@ memoize(Fun, Args) ->
Result ->
Result
end.
wait_until(Fun) ->
wait_until(Fun, 5_000, 250).
wait_until(Fun, Timeout, Sleep) ->
Deadline = erlang:monotonic_time(millisecond) + Timeout,
loop_until(Fun, Deadline, Sleep).
loop_until(Fun, Deadline, Sleep) ->
case Fun() of
true ->
ok;
false ->
case erlang:monotonic_time(millisecond) of
Now when Now < Deadline ->
timer:sleep(Sleep),
loop_until(Fun, Deadline, Sleep);
_ ->
timeout
end
end.