fix: avoid error 500 when node is re-joining cluster

Fixes https://emqx.atlassian.net/browse/EMQX-9899
This commit is contained in:
Paulo Zulato 2023-05-19 20:16:05 -03:00
parent 1f36726cab
commit ea86f4442b
12 changed files with 55 additions and 28 deletions

View File

@ -147,7 +147,7 @@ unwrap_erpc({throw, A}) ->
{error, A};
unwrap_erpc({error, {exception, Err, _Stack}}) ->
{error, Err};
unwrap_erpc({error, {exit, Err}}) ->
unwrap_erpc({exit, Err}) ->
{error, Err};
unwrap_erpc({error, {erpc, Err}}) ->
{error, Err}.

View File

@ -423,8 +423,8 @@ users(get, #{query_string := QueryString}) ->
of
{error, page_limit_invalid} ->
{400, #{code => <<"INVALID_PARAMETER">>, message => <<"page_limit_invalid">>}};
{error, Node, {badrpc, R}} ->
Message = list_to_binary(io_lib:format("bad rpc call ~p, Reason ~p", [Node, R])),
{error, Node, Error} ->
Message = list_to_binary(io_lib:format("bad rpc call ~p, Reason ~p", [Node, Error])),
{500, #{code => <<"NODE_DOWN">>, message => Message}};
Result ->
{200, Result}
@ -459,8 +459,8 @@ clients(get, #{query_string := QueryString}) ->
of
{error, page_limit_invalid} ->
{400, #{code => <<"INVALID_PARAMETER">>, message => <<"page_limit_invalid">>}};
{error, Node, {badrpc, R}} ->
Message = list_to_binary(io_lib:format("bad rpc call ~p, Reason ~p", [Node, R])),
{error, Node, Error} ->
Message = list_to_binary(io_lib:format("bad rpc call ~p, Reason ~p", [Node, Error])),
{500, #{code => <<"NODE_DOWN">>, message => Message}};
Result ->
{200, Result}

View File

@ -756,7 +756,14 @@ format_bridge_info([FirstBridge | _] = Bridges) ->
}).
format_bridge_metrics(Bridges) ->
NodeMetrics = collect_metrics(Bridges),
FilteredBridges = lists:filter(
fun
({_Node, Metric}) when is_map(Metric) -> true;
(_) -> false
end,
Bridges
),
NodeMetrics = collect_metrics(FilteredBridges),
#{
metrics => aggregate_metrics(NodeMetrics),
node_metrics => NodeMetrics

View File

@ -1,6 +1,6 @@
{application, emqx_ctl, [
{description, "Backend for emqx_ctl script"},
{vsn, "0.1.0"},
{vsn, "0.1.1"},
{registered, []},
{mod, {emqx_ctl_app, []}},
{applications, [

View File

@ -228,7 +228,7 @@ handle_call({register_command, Cmd, MF, Opts}, _From, State = #state{seq = Seq})
ets:insert(?CMD_TAB, {{Seq, Cmd}, MF, Opts}),
{reply, ok, next_seq(State)};
[[OriginSeq] | _] ->
?LOG_WARNING(#{msg => "CMD_overidden", cmd => Cmd, mf => MF}),
?LOG_WARNING(#{msg => "CMD_overridden", cmd => Cmd, mf => MF}),
true = ets:insert(?CMD_TAB, {{OriginSeq, Cmd}, MF, Opts}),
{reply, ok, State}
end;

View File

@ -1,7 +1,7 @@
%% -*- mode: erlang -*-
{application, emqx_gateway, [
{description, "The Gateway management application"},
{vsn, "0.1.16"},
{vsn, "0.1.17"},
{registered, []},
{mod, {emqx_gateway_app, []}},
{applications, [kernel, stdlib, emqx, emqx_authn, emqx_ctl]},

View File

@ -133,8 +133,10 @@ clients(get, #{
case Result of
{error, page_limit_invalid} ->
{400, #{code => <<"INVALID_PARAMETER">>, message => <<"page_limit_invalid">>}};
{error, Node, {badrpc, R}} ->
Message = list_to_binary(io_lib:format("bad rpc call ~p, Reason ~p", [Node, R])),
{error, Node, Error} ->
Message = list_to_binary(
io_lib:format("bad rpc call ~p, Reason ~p", [Node, Error])
),
{500, #{code => <<"NODE_DOWN">>, message => Message}};
Response ->
{200, Response}

View File

@ -134,8 +134,8 @@ do_node_query(
ResultAcc
) ->
case do_query(Node, QueryState) of
{error, {badrpc, R}} ->
{error, Node, {badrpc, R}};
{error, Error} ->
{error, Node, Error};
{Rows, NQueryState = #{complete := Complete}} ->
case accumulate_query_rows(Node, Rows, NQueryState, ResultAcc) of
{enough, NResultAcc} ->
@ -179,8 +179,8 @@ do_cluster_query(
ResultAcc
) ->
case do_query(Node, QueryState) of
{error, {badrpc, R}} ->
{error, Node, {badrpc, R}};
{error, Error} ->
{error, Node, Error};
{Rows, NQueryState = #{complete := Complete}} ->
case accumulate_query_rows(Node, Rows, NQueryState, ResultAcc) of
{enough, NResultAcc} ->
@ -275,7 +275,7 @@ do_query(Node, QueryState) when Node =:= node() ->
do_select(Node, QueryState);
do_query(Node, QueryState) ->
case
rpc:call(
catch rpc:call(
Node,
?MODULE,
do_query,
@ -284,6 +284,7 @@ do_query(Node, QueryState) ->
)
of
{badrpc, _} = R -> {error, R};
{'EXIT', _} = R -> {error, R};
Ret -> Ret
end.
@ -298,15 +299,24 @@ do_select(
) ->
QueryState = maybe_apply_total_query(Node, QueryState0),
Result =
case maps:get(continuation, QueryState, undefined) of
undefined ->
ets:select(Tab, Ms, Limit);
Continuation ->
%% XXX: Repair is necessary because we pass Continuation back
%% and forth through the nodes in the `do_cluster_query`
ets:select(ets:repair_continuation(Continuation, Ms))
try
case maps:get(continuation, QueryState, undefined) of
undefined ->
ets:select(Tab, Ms, Limit);
Continuation ->
%% XXX: Repair is necessary because we pass Continuation back
%% and forth through the nodes in the `do_cluster_query`
ets:select(ets:repair_continuation(Continuation, Ms))
end
catch
exit:_ = Exit ->
{error, Exit};
Type:Reason:Stack ->
{error, #{exception => Type, reason => Reason, stacktrace => Stack}}
end,
case Result of
{error, _} ->
{[], mark_complete(QueryState)};
{Rows, '$end_of_table'} ->
NRows = maybe_apply_fuzzy_filter(Rows, QueryState),
{NRows, mark_complete(QueryState)};
@ -354,7 +364,11 @@ counting_total_fun(_QueryState = #{match_spec := Ms, fuzzy_fun := undefined}) ->
[{MatchHead, Conditions, _Return}] = Ms,
CountingMs = [{MatchHead, Conditions, [true]}],
fun(Tab) ->
ets:select_count(Tab, CountingMs)
try
ets:select_count(Tab, CountingMs)
catch
_Type:_Reason -> 0
end
end;
counting_total_fun(_QueryState = #{fuzzy_fun := FuzzyFun}) when FuzzyFun =/= undefined ->
%% XXX: Calculating the total number for a fuzzy searching is very very expensive

View File

@ -123,8 +123,8 @@ alarms(get, #{query_string := QString}) ->
of
{error, page_limit_invalid} ->
{400, #{code => <<"INVALID_PARAMETER">>, message => <<"page_limit_invalid">>}};
{error, Node, {badrpc, R}} ->
Message = list_to_binary(io_lib:format("bad rpc call ~p, Reason ~p", [Node, R])),
{error, Node, Error} ->
Message = list_to_binary(io_lib:format("bad rpc call ~p, Reason ~p", [Node, Error])),
{500, #{code => <<"NODE_DOWN">>, message => Message}};
Response ->
{200, Response}

View File

@ -120,8 +120,8 @@ do_list(Params) ->
of
{error, page_limit_invalid} ->
{400, #{code => <<"INVALID_PARAMETER">>, message => <<"page_limit_invalid">>}};
{error, Node, {badrpc, R}} ->
Message = list_to_binary(io_lib:format("bad rpc call ~p, Reason ~p", [Node, R])),
{error, Node, Error} ->
Message = list_to_binary(io_lib:format("bad rpc call ~p, Reason ~p", [Node, Error])),
{500, #{code => <<"NODE_DOWN">>, message => Message}};
Response ->
{200, Response}

View File

@ -339,6 +339,9 @@ param_path_id() ->
of
{error, page_limit_invalid} ->
{400, #{code => 'BAD_REQUEST', message => <<"page_limit_invalid">>}};
{error, Node, Error} ->
Message = list_to_binary(io_lib:format("bad rpc call ~p, Reason ~p", [Node, Error])),
{500, #{code => <<"NODE_DOWN">>, message => Message}};
Result ->
{200, Result}
end;

View File

@ -0,0 +1 @@
Fix an Internal Error 500 that sometimes occurred when the bridge statistics page was updated while a node was (re)joining the cluster.