fix(dsrepl): correctly handle ra membership change command results

Before this change, results similar to `{error, {no_more_servers_to_try,
[{error, nodedown}, {error, not_member}]}}` were considered retryable
failures, which is incorrect.
This commit is contained in:
Andrew Mayorov 2024-04-08 22:44:34 +02:00
parent 3223797ae5
commit d12e907209
No known key found for this signature in database
GPG Key ID: 2837C62ACFBFED5D
1 changed files with 22 additions and 6 deletions

View File

@ -173,13 +173,14 @@ add_local_server(DB, Shard) ->
membership => voter membership => voter
} }
end, end,
case ra:add_member(ShardServers, ServerRecord, ?MEMBERSHIP_CHANGE_TIMEOUT) of Timeout = ?MEMBERSHIP_CHANGE_TIMEOUT,
case ra_try_servers(ShardServers, fun ra:add_member/3, [ServerRecord, Timeout]) of
{ok, _, _Leader} -> {ok, _, _Leader} ->
ok; ok;
{error, already_member} -> {error, already_member} ->
ok; ok;
{error, Reason} -> Error ->
{error, recoverable, Reason} {error, recoverable, Error}
end. end.
%% @doc Remove a local server from the shard cluster and clean up on-disk data. %% @doc Remove a local server from the shard cluster and clean up on-disk data.
@ -219,13 +220,14 @@ drop_local_server(DB, Shard) ->
ok | emqx_ds:error(_Reason). ok | emqx_ds:error(_Reason).
remove_server(DB, Shard, Server) -> remove_server(DB, Shard, Server) ->
ShardServers = shard_servers(DB, Shard), ShardServers = shard_servers(DB, Shard),
case ra:remove_member(ShardServers, Server, ?MEMBERSHIP_CHANGE_TIMEOUT) of Timeout = ?MEMBERSHIP_CHANGE_TIMEOUT,
case ra_try_servers(ShardServers, fun ra:remove_member/3, [Server, Timeout]) of
{ok, _, _Leader} -> {ok, _, _Leader} ->
ok; ok;
{error, not_member} -> {error, not_member} ->
ok; ok;
{error, Reason} -> Error ->
{error, recoverable, Reason} {error, recoverable, Error}
end. end.
-spec server_info -spec server_info
@ -272,6 +274,20 @@ member_readiness(#{status := Status, voter_status := #{membership := Membership}
member_readiness(#{}) -> member_readiness(#{}) ->
unknown. unknown.
%%
%% Call `Fun' against each server in turn until one of them gives a
%% conclusive answer.
%%
%% `Fun' is applied to the candidate server prepended to `Args'.
%% A result of `{error, noproc}' or `{error, nodedown}' moves on to the
%% next server in the list. Any other result (success, a different
%% error, a timeout, ...) is returned to the caller unchanged.
%% If the list runs out, `{error, servers_unreachable}' is returned.
ra_try_servers([], _Fun, _Args) ->
    %% Every candidate answered with `noproc' / `nodedown' (or none were given).
    {error, servers_unreachable};
ra_try_servers([Candidate | Remaining], Fun, Args) ->
    case apply(Fun, [Candidate | Args]) of
        {error, noproc} ->
            ra_try_servers(Remaining, Fun, Args);
        {error, nodedown} ->
            ra_try_servers(Remaining, Fun, Args);
        Outcome ->
            %% Includes `{ok, Reply, Leader}' as well as non-retryable failures.
            Outcome
    end.
ra_overview(Server) -> ra_overview(Server) ->
case ra:member_overview(Server) of case ra:member_overview(Server) of
{ok, Overview, _Leader} -> {ok, Overview, _Leader} ->