fix(dsrepl): retry only crashed `{add, Site}` membership transitions
This minimizes the potential negative impact of removal transitions that crash for unknown or unusual reasons: such transitions are no longer retried after a crash.
This commit is contained in:
parent
4c0cc079c2
commit
75bb7f5cdc
|
@ -290,6 +290,7 @@ do_rm_unresponsive(DB, Shard, Site) ->
|
||||||
|
|
||||||
trans_delay(DB, Shard, Trans, Delay, NextHandler) ->
|
trans_delay(DB, Shard, Trans, Delay, NextHandler) ->
|
||||||
ok = delay(Delay),
|
ok = delay(Delay),
|
||||||
|
%% NOTE: Proceed only if the transition we are going to handle is still desired.
|
||||||
case next_transitions(DB, Shard) of
|
case next_transitions(DB, Shard) of
|
||||||
[Trans | _] ->
|
[Trans | _] ->
|
||||||
apply_handler(NextHandler, DB, Shard, Trans);
|
apply_handler(NextHandler, DB, Shard, Trans);
|
||||||
|
@ -338,11 +339,6 @@ handle_transition_exit(Shard, Trans, normal, State = #{db := DB}) ->
|
||||||
handle_transition_exit(_Shard, _Trans, {shutdown, skipped}, State) ->
|
handle_transition_exit(_Shard, _Trans, {shutdown, skipped}, State) ->
|
||||||
State;
|
State;
|
||||||
handle_transition_exit(Shard, Trans, Reason, State) ->
|
handle_transition_exit(Shard, Trans, Reason, State) ->
|
||||||
%% NOTE
|
|
||||||
%% In case of `{add, Site}` transition failure, we have no choice but to retry:
|
|
||||||
%% no other node can perform the transition and make progress towards the desired
|
|
||||||
%% state. For simplicity, we retry any crashed transition handler after a fixed
|
|
||||||
%% delay.
|
|
||||||
logger:warning(#{
|
logger:warning(#{
|
||||||
msg => "Shard membership transition failed",
|
msg => "Shard membership transition failed",
|
||||||
shard => Shard,
|
shard => Shard,
|
||||||
|
@ -350,9 +346,18 @@ handle_transition_exit(Shard, Trans, Reason, State) ->
|
||||||
reason => Reason,
|
reason => Reason,
|
||||||
retry_in => ?CRASH_RETRY_DELAY
|
retry_in => ?CRASH_RETRY_DELAY
|
||||||
}),
|
}),
|
||||||
{Track, Handler} = transition_handler(Shard, Trans, State),
|
%% NOTE
|
||||||
RetryHandler = {fun trans_delay/5, [?CRASH_RETRY_DELAY, Handler]},
|
%% In case of `{add, Site}` transition failure, we have no choice but to retry:
|
||||||
ensure_transition_handler(Track, Shard, Trans, RetryHandler, State).
|
%% no other node can perform the transition and make progress towards the desired
|
||||||
|
%% state.
|
||||||
|
case Trans of
|
||||||
|
{add, _ThisSite} ->
|
||||||
|
{Track, Handler} = transition_handler(Shard, Trans, State),
|
||||||
|
RetryHandler = {fun trans_delay/5, [?CRASH_RETRY_DELAY, Handler]},
|
||||||
|
ensure_transition_handler(Track, Shard, Trans, RetryHandler, State);
|
||||||
|
_Another ->
|
||||||
|
State
|
||||||
|
end.
|
||||||
|
|
||||||
%%
|
%%
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue