Merge pull request #8202 from zmstone/0613-fix-no-alarm-when-init-rpc-call-failed

fix(cluster_rpc): no alarm when failed to initiate RPC call
This commit is contained in:
Zaiming (Stone) Shi 2022-06-15 11:36:58 +01:00 committed by GitHub
commit c371dd2464
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 41 additions and 47 deletions

View File

@ -32,6 +32,7 @@
]). ]).
-export([ -export([
get_node_tnx_id/1, get_node_tnx_id/1,
get_cluster_tnx_id/0,
latest_tnx_id/0, latest_tnx_id/0,
make_initiate_call_req/3 make_initiate_call_req/3
]). ]).
@ -60,9 +61,11 @@
-include_lib("emqx/include/logger.hrl"). -include_lib("emqx/include/logger.hrl").
-include("emqx_conf.hrl"). -include("emqx_conf.hrl").
-define(INITIATE(MFA, LatestIdLastSeen), {initiate, MFA, LatestIdLastSeen}). -define(INITIATE(MFA), {initiate, MFA}).
-define(CATCH_UP, catch_up). -define(CATCH_UP, catch_up).
-define(TIMEOUT, timer:minutes(1)). -define(TIMEOUT, timer:minutes(1)).
-define(APPLY_KIND_REPLICATE, replicate).
-define(APPLY_KIND_INITIATE, initiate).
-type tnx_id() :: pos_integer(). -type tnx_id() :: pos_integer().
@ -74,7 +77,7 @@
| {peers_lagging, tnx_id(), Result, [node()]}. | {peers_lagging, tnx_id(), Result, [node()]}.
-type multicall_return() :: multicall_return(_). -type multicall_return() :: multicall_return(_).
-type init_call_req() :: ?INITIATE({module(), atom(), list()}, tnx_id()). -type init_call_req() :: ?INITIATE({module(), atom(), list()}).
%%%=================================================================== %%%===================================================================
%%% API %%% API
@ -146,25 +149,11 @@ multicall(M, F, A, RequiredSyncs, Timeout) when RequiredSyncs =:= all orelse Req
%% return {init_failure, Error} when the initial MFA result is no ok or {ok, term()}. %% return {init_failure, Error} when the initial MFA result is no ok or {ok, term()}.
%% return {peers_lagging, TnxId, MFARes, Nodes} when some Nodes failed and some Node ok. %% return {peers_lagging, TnxId, MFARes, Nodes} when some Nodes failed and some Node ok.
-spec do_multicall(module(), atom(), list(), succeed_num(), timeout()) -> multicall_return(). -spec do_multicall(module(), atom(), list(), succeed_num(), timeout()) -> multicall_return().
do_multicall(M, F, A, RequiredSyncs, Timeout) when do_multicall(M, F, A, RequiredSyncs, Timeout) ->
RequiredSyncs =:= all orelse RequiredSyncs >= 1 %% assert
-> true = (RequiredSyncs =:= all orelse RequiredSyncs >= 1),
%% Idealy 'LatestId' should be provided by the multicall originator,
%% which is the viewer of the state e.g.
%% * Sysadmin who issues CLI-commands or REST-API calls to make config changes
%% * Dashboard viewer who is making decision based on what they can see from the UI
%% To reach the ideal state, it would require adding transaction ID to each and
%% every view/GET requests and also provide the ID as a part of the view/GET responses.
%%
%% To keep things simple, we try to get the 'old' view when a multicall request
%% is received as early as possible.
%%
%% Reason to do this:
%% The 'initiate' call handler tries to take a table lock (cluster-wide) before
%% bumping the transaction ID. While waiting for the lock, the ID might have been
%% bumpped by another node in the cluster.
InitReq = make_initiate_call_req(M, F, A),
Begin = erlang:monotonic_time(), Begin = erlang:monotonic_time(),
InitReq = make_initiate_call_req(M, F, A),
InitRes = InitRes =
case mria_rlog:role() of case mria_rlog:role() of
core -> core ->
@ -216,13 +205,12 @@ status() ->
-spec latest_tnx_id() -> pos_integer(). -spec latest_tnx_id() -> pos_integer().
latest_tnx_id() -> latest_tnx_id() ->
{atomic, TnxId} = transaction(fun get_latest_id/0, []), {atomic, TnxId} = transaction(fun get_cluster_tnx_id/0, []),
TnxId. TnxId.
-spec make_initiate_call_req(module(), atom(), list()) -> init_call_req(). -spec make_initiate_call_req(module(), atom(), list()) -> init_call_req().
make_initiate_call_req(M, F, A) -> make_initiate_call_req(M, F, A) ->
TnxId = get_latest_id(dirty), ?INITIATE({M, F, A}).
?INITIATE({M, F, A}, TnxId).
-spec get_node_tnx_id(node()) -> integer(). -spec get_node_tnx_id(node()) -> integer().
get_node_tnx_id(Node) -> get_node_tnx_id(Node) ->
@ -289,8 +277,8 @@ handle_call(reset, _From, State) ->
_ = mria:clear_table(?CLUSTER_COMMIT), _ = mria:clear_table(?CLUSTER_COMMIT),
_ = mria:clear_table(?CLUSTER_MFA), _ = mria:clear_table(?CLUSTER_MFA),
{reply, ok, State, {continue, ?CATCH_UP}}; {reply, ok, State, {continue, ?CATCH_UP}};
handle_call(?INITIATE(MFA, LatestId), _From, State = #{node := Node}) -> handle_call(?INITIATE(MFA), _From, State = #{node := Node}) ->
case transaction(fun init_mfa/3, [Node, MFA, LatestId]) of case transaction(fun init_mfa/2, [Node, MFA]) of
{atomic, {ok, TnxId, Result}} -> {atomic, {ok, TnxId, Result}} ->
{reply, {ok, TnxId, Result}, State, {continue, ?CATCH_UP}}; {reply, {ok, TnxId, Result}, State, {continue, ?CATCH_UP}};
{aborted, Error} -> {aborted, Error} ->
@ -330,7 +318,7 @@ catch_up(#{node := Node, retry_interval := RetryMs} = State, SkipResult) ->
{atomic, caught_up} -> {atomic, caught_up} ->
?TIMEOUT; ?TIMEOUT;
{atomic, {still_lagging, NextId, MFA}} -> {atomic, {still_lagging, NextId, MFA}} ->
{Succeed, _} = apply_mfa(NextId, MFA, catch_up), {Succeed, _} = apply_mfa(NextId, MFA, ?APPLY_KIND_REPLICATE),
case Succeed orelse SkipResult of case Succeed orelse SkipResult of
true -> true ->
case transaction(fun commit/2, [Node, NextId]) of case transaction(fun commit/2, [Node, NextId]) of
@ -356,7 +344,7 @@ read_next_mfa(Node) ->
NextId = NextId =
case mnesia:wread({?CLUSTER_COMMIT, Node}) of case mnesia:wread({?CLUSTER_COMMIT, Node}) of
[] -> [] ->
LatestId = get_latest_id(), LatestId = get_cluster_tnx_id(),
TnxId = max(LatestId - 1, 0), TnxId = max(LatestId - 1, 0),
commit(Node, TnxId), commit(Node, TnxId),
?SLOG(notice, #{ ?SLOG(notice, #{
@ -382,7 +370,7 @@ do_fast_forward_to_commit(ToTnxId, State = #{node := Node}) ->
true -> true ->
NodeId; NodeId;
false -> false ->
{atomic, LatestId} = transaction(fun get_latest_id/0, []), {atomic, LatestId} = transaction(fun get_cluster_tnx_id/0, []),
case LatestId =< NodeId of case LatestId =< NodeId of
true -> true ->
NodeId; NodeId;
@ -392,24 +380,17 @@ do_fast_forward_to_commit(ToTnxId, State = #{node := Node}) ->
end end
end. end.
get_latest_id() -> get_cluster_tnx_id() ->
get_latest_id(tnx). case mnesia:last(?CLUSTER_MFA) of
get_latest_id(IsolationLevel) ->
F =
case IsolationLevel of
tnx -> fun mnesia:last/1;
dirty -> fun mnesia:dirty_last/1
end,
case F(?CLUSTER_MFA) of
'$end_of_table' -> 0; '$end_of_table' -> 0;
Id -> Id Id -> Id
end. end.
init_mfa(Node, MFA, LatestIdLastSeen) -> init_mfa(Node, MFA) ->
mnesia:write_lock_table(?CLUSTER_MFA), mnesia:write_lock_table(?CLUSTER_MFA),
LatestId = get_latest_id(), LatestId = get_cluster_tnx_id(),
case LatestIdLastSeen =:= LatestId of MyTnxId = get_node_tnx_id(node()),
case MyTnxId =:= LatestId of
true -> true ->
TnxId = LatestId + 1, TnxId = LatestId + 1,
MFARec = #cluster_rpc_mfa{ MFARec = #cluster_rpc_mfa{
@ -420,17 +401,21 @@ init_mfa(Node, MFA, LatestIdLastSeen) ->
}, },
ok = mnesia:write(?CLUSTER_MFA, MFARec, write), ok = mnesia:write(?CLUSTER_MFA, MFARec, write),
ok = commit(Node, TnxId), ok = commit(Node, TnxId),
case apply_mfa(TnxId, MFA, init) of case apply_mfa(TnxId, MFA, ?APPLY_KIND_INITIATE) of
{true, Result} -> {ok, TnxId, Result}; {true, Result} -> {ok, TnxId, Result};
{false, Error} -> mnesia:abort(Error) {false, Error} -> mnesia:abort(Error)
end; end;
false -> false ->
?SLOG(error, #{ %% refuse to initiate cluster call from this node
%% because it's likely that the caller is based on
%% a stale view.
Reason = #{
msg => stale_view_of_cluster_state, msg => stale_view_of_cluster_state,
tnx_id => LatestId, cluster_tnx_id => LatestId,
last_seen_tnx_id => LatestIdLastSeen node_tnx_id => MyTnxId
}), },
mnesia:abort({error, stale_view_of_cluster_state}) ?SLOG(warning, Reason),
mnesia:abort({error, Reason})
end. end.
transaction(Func, Args) -> transaction(Func, Args) ->
@ -495,6 +480,15 @@ is_success(ok) -> true;
is_success({ok, _}) -> true; is_success({ok, _}) -> true;
is_success(_) -> false. is_success(_) -> false.
log_and_alarm(IsSuccess, Res, #{kind := ?APPLY_KIND_INITIATE} = Meta) ->
%% no alarm or error log in case of failure at originating a new cluster-call
%% because nothing is committed
case IsSuccess of
true ->
?SLOG(debug, Meta#{msg => "cluster_rpc_apply_result", result => Res});
false ->
?SLOG(warning, Meta#{msg => "cluster_rpc_apply_result", result => Res})
end;
log_and_alarm(true, Res, Meta) -> log_and_alarm(true, Res, Meta) ->
?SLOG(debug, Meta#{msg => "cluster_rpc_apply_ok", result => Res}), ?SLOG(debug, Meta#{msg => "cluster_rpc_apply_ok", result => Res}),
do_alarm(deactivate, Res, Meta); do_alarm(deactivate, Res, Meta);