From ea86f4442b966b9487ef3749e2f2da05ee510ffa Mon Sep 17 00:00:00 2001 From: Paulo Zulato Date: Fri, 19 May 2023 20:16:05 -0300 Subject: [PATCH] fix: avoid error 500 when node is re-joining cluster Fixes https://emqx.atlassian.net/browse/EMQX-9899 --- apps/emqx/src/emqx_rpc.erl | 2 +- apps/emqx_authz/src/emqx_authz_api_mnesia.erl | 8 ++-- apps/emqx_bridge/src/emqx_bridge_api.erl | 9 ++++- apps/emqx_ctl/src/emqx_ctl.app.src | 2 +- apps/emqx_ctl/src/emqx_ctl.erl | 2 +- apps/emqx_gateway/src/emqx_gateway.app.src | 2 +- .../src/emqx_gateway_api_clients.erl | 6 ++- apps/emqx_management/src/emqx_mgmt_api.erl | 40 +++++++++++++------ .../src/emqx_mgmt_api_alarms.erl | 4 +- .../src/emqx_mgmt_api_topics.erl | 4 +- .../src/emqx_rule_engine_api.erl | 3 ++ changes/ce/fix-10760.en.md | 1 + 12 files changed, 55 insertions(+), 28 deletions(-) create mode 100644 changes/ce/fix-10760.en.md diff --git a/apps/emqx/src/emqx_rpc.erl b/apps/emqx/src/emqx_rpc.erl index 062bde68b..86ec5937f 100644 --- a/apps/emqx/src/emqx_rpc.erl +++ b/apps/emqx/src/emqx_rpc.erl @@ -147,7 +147,7 @@ unwrap_erpc({throw, A}) -> {error, A}; unwrap_erpc({error, {exception, Err, _Stack}}) -> {error, Err}; -unwrap_erpc({error, {exit, Err}}) -> +unwrap_erpc({exit, Err}) -> {error, Err}; unwrap_erpc({error, {erpc, Err}}) -> {error, Err}. diff --git a/apps/emqx_authz/src/emqx_authz_api_mnesia.erl b/apps/emqx_authz/src/emqx_authz_api_mnesia.erl index b39379b43..f5ac40f5e 100644 --- a/apps/emqx_authz/src/emqx_authz_api_mnesia.erl +++ b/apps/emqx_authz/src/emqx_authz_api_mnesia.erl @@ -423,8 +423,8 @@ users(get, #{query_string := QueryString}) -> of {error, page_limit_invalid} -> {400, #{code => <<"INVALID_PARAMETER">>, message => <<"page_limit_invalid">>}}; - {error, Node, {badrpc, R}} -> - Message = list_to_binary(io_lib:format("bad rpc call ~p, Reason ~p", [Node, R])), + {error, Node, Error} -> + Message = list_to_binary(io_lib:format("bad rpc call ~p, Reason ~p", [Node, Error])), {500, #{code => <<"NODE_DOWN">>, message => Message}}; Result -> {200, Result} @@ -459,8 +459,8 @@ clients(get, #{query_string := QueryString}) -> of {error, page_limit_invalid} -> {400, #{code => <<"INVALID_PARAMETER">>, message => <<"page_limit_invalid">>}}; - {error, Node, {badrpc, R}} -> - Message = list_to_binary(io_lib:format("bad rpc call ~p, Reason ~p", [Node, R])), + {error, Node, Error} -> + Message = list_to_binary(io_lib:format("bad rpc call ~p, Reason ~p", [Node, Error])), {500, #{code => <<"NODE_DOWN">>, message => Message}}; Result -> {200, Result} diff --git a/apps/emqx_bridge/src/emqx_bridge_api.erl b/apps/emqx_bridge/src/emqx_bridge_api.erl index 847270664..026c24ae2 100644 --- a/apps/emqx_bridge/src/emqx_bridge_api.erl +++ b/apps/emqx_bridge/src/emqx_bridge_api.erl @@ -756,7 +756,14 @@ format_bridge_info([FirstBridge | _] = Bridges) -> }). format_bridge_metrics(Bridges) -> - NodeMetrics = collect_metrics(Bridges), + FilteredBridges = lists:filter( + fun + ({_Node, Metric}) when is_map(Metric) -> true; + (_) -> false + end, + Bridges + ), + NodeMetrics = collect_metrics(FilteredBridges), #{ metrics => aggregate_metrics(NodeMetrics), node_metrics => NodeMetrics diff --git a/apps/emqx_ctl/src/emqx_ctl.app.src b/apps/emqx_ctl/src/emqx_ctl.app.src index 9de598a89..c3abade67 100644 --- a/apps/emqx_ctl/src/emqx_ctl.app.src +++ b/apps/emqx_ctl/src/emqx_ctl.app.src @@ -1,6 +1,6 @@ {application, emqx_ctl, [ {description, "Backend for emqx_ctl script"}, - {vsn, "0.1.0"}, + {vsn, "0.1.1"}, {registered, []}, {mod, {emqx_ctl_app, []}}, {applications, [ diff --git a/apps/emqx_ctl/src/emqx_ctl.erl b/apps/emqx_ctl/src/emqx_ctl.erl index 864b53d2a..6123056b9 100644 --- a/apps/emqx_ctl/src/emqx_ctl.erl +++ b/apps/emqx_ctl/src/emqx_ctl.erl @@ -228,7 +228,7 @@ handle_call({register_command, Cmd, MF, Opts}, _From, State = #state{seq = Seq}) ets:insert(?CMD_TAB, {{Seq, Cmd}, MF, Opts}), {reply, ok, next_seq(State)}; [[OriginSeq] | _] -> - ?LOG_WARNING(#{msg => "CMD_overidden", cmd => Cmd, mf => MF}), + ?LOG_WARNING(#{msg => "CMD_overridden", cmd => Cmd, mf => MF}), true = ets:insert(?CMD_TAB, {{OriginSeq, Cmd}, MF, Opts}), {reply, ok, State} end; diff --git a/apps/emqx_gateway/src/emqx_gateway.app.src b/apps/emqx_gateway/src/emqx_gateway.app.src index 2ffca464d..26dab4c9c 100644 --- a/apps/emqx_gateway/src/emqx_gateway.app.src +++ b/apps/emqx_gateway/src/emqx_gateway.app.src @@ -1,7 +1,7 @@ %% -*- mode: erlang -*- {application, emqx_gateway, [ {description, "The Gateway management application"}, - {vsn, "0.1.16"}, + {vsn, "0.1.17"}, {registered, []}, {mod, {emqx_gateway_app, []}}, {applications, [kernel, stdlib, emqx, emqx_authn, emqx_ctl]}, diff --git a/apps/emqx_gateway/src/emqx_gateway_api_clients.erl b/apps/emqx_gateway/src/emqx_gateway_api_clients.erl index cd387e3bb..7f8f30c8b 100644 --- a/apps/emqx_gateway/src/emqx_gateway_api_clients.erl +++ b/apps/emqx_gateway/src/emqx_gateway_api_clients.erl @@ -133,8 +133,10 @@ clients(get, #{ case Result of {error, page_limit_invalid} -> {400, #{code => <<"INVALID_PARAMETER">>, message => <<"page_limit_invalid">>}}; - {error, Node, {badrpc, R}} -> - Message = list_to_binary(io_lib:format("bad rpc call ~p, Reason ~p", [Node, R])), + {error, Node, Error} -> + Message = list_to_binary( + io_lib:format("bad rpc call ~p, Reason ~p", [Node, Error]) + ), {500, #{code => <<"NODE_DOWN">>, message => Message}}; Response -> {200, Response} diff --git a/apps/emqx_management/src/emqx_mgmt_api.erl b/apps/emqx_management/src/emqx_mgmt_api.erl index 8365b983c..7280c17b6 100644 --- a/apps/emqx_management/src/emqx_mgmt_api.erl +++ b/apps/emqx_management/src/emqx_mgmt_api.erl @@ -134,8 +134,8 @@ do_node_query( ResultAcc ) -> case do_query(Node, QueryState) of - {error, {badrpc, R}} -> - {error, Node, {badrpc, R}}; + {error, Error} -> + {error, Node, Error}; {Rows, NQueryState = #{complete := Complete}} -> case accumulate_query_rows(Node, Rows, NQueryState, ResultAcc) of {enough, NResultAcc} -> @@ -179,8 +179,8 @@ do_cluster_query( ResultAcc ) -> case do_query(Node, QueryState) of - {error, {badrpc, R}} -> - {error, Node, {badrpc, R}}; + {error, Error} -> + {error, Node, Error}; {Rows, NQueryState = #{complete := Complete}} -> case accumulate_query_rows(Node, Rows, NQueryState, ResultAcc) of {enough, NResultAcc} -> @@ -275,7 +275,7 @@ do_query(Node, QueryState) when Node =:= node() -> do_select(Node, QueryState); do_query(Node, QueryState) -> case - rpc:call( + catch rpc:call( Node, ?MODULE, do_query, @@ -284,6 +284,7 @@ do_query(Node, QueryState) -> ) of {badrpc, _} = R -> {error, R}; + {'EXIT', _} = R -> {error, R}; Ret -> Ret end. @@ -298,15 +299,24 @@ do_select( ) -> QueryState = maybe_apply_total_query(Node, QueryState0), Result = - case maps:get(continuation, QueryState, undefined) of - undefined -> - ets:select(Tab, Ms, Limit); - Continuation -> - %% XXX: Repair is necessary because we pass Continuation back - %% and forth through the nodes in the `do_cluster_query` - ets:select(ets:repair_continuation(Continuation, Ms)) + try + case maps:get(continuation, QueryState, undefined) of + undefined -> + ets:select(Tab, Ms, Limit); + Continuation -> + %% XXX: Repair is necessary because we pass Continuation back + %% and forth through the nodes in the `do_cluster_query` + ets:select(ets:repair_continuation(Continuation, Ms)) + end + catch + exit:_ = Exit -> + {error, Exit}; + Type:Reason:Stack -> + {error, #{exception => Type, reason => Reason, stacktrace => Stack}} end, case Result of + {error, _} -> + {[], mark_complete(QueryState)}; {Rows, '$end_of_table'} -> NRows = maybe_apply_fuzzy_filter(Rows, QueryState), {NRows, mark_complete(QueryState)}; @@ -354,7 +364,11 @@ counting_total_fun(_QueryState = #{match_spec := Ms, fuzzy_fun := undefined}) -> [{MatchHead, Conditions, _Return}] = Ms, CountingMs = [{MatchHead, Conditions, [true]}], fun(Tab) -> - ets:select_count(Tab, CountingMs) + try + ets:select_count(Tab, CountingMs) + catch + _Type:_Reason -> 0 + end end; counting_total_fun(_QueryState = #{fuzzy_fun := FuzzyFun}) when FuzzyFun =/= undefined -> %% XXX: Calculating the total number for a fuzzy searching is very very expensive diff --git a/apps/emqx_management/src/emqx_mgmt_api_alarms.erl b/apps/emqx_management/src/emqx_mgmt_api_alarms.erl index 88c2be89d..d5965f019 100644 --- a/apps/emqx_management/src/emqx_mgmt_api_alarms.erl +++ b/apps/emqx_management/src/emqx_mgmt_api_alarms.erl @@ -123,8 +123,8 @@ alarms(get, #{query_string := QString}) -> of {error, page_limit_invalid} -> {400, #{code => <<"INVALID_PARAMETER">>, message => <<"page_limit_invalid">>}}; - {error, Node, {badrpc, R}} -> - Message = list_to_binary(io_lib:format("bad rpc call ~p, Reason ~p", [Node, R])), + {error, Node, Error} -> + Message = list_to_binary(io_lib:format("bad rpc call ~p, Reason ~p", [Node, Error])), {500, #{code => <<"NODE_DOWN">>, message => Message}}; Response -> {200, Response} diff --git a/apps/emqx_management/src/emqx_mgmt_api_topics.erl b/apps/emqx_management/src/emqx_mgmt_api_topics.erl index d451261ff..f6e790bf4 100644 --- a/apps/emqx_management/src/emqx_mgmt_api_topics.erl +++ b/apps/emqx_management/src/emqx_mgmt_api_topics.erl @@ -120,8 +120,8 @@ do_list(Params) -> of {error, page_limit_invalid} -> {400, #{code => <<"INVALID_PARAMETER">>, message => <<"page_limit_invalid">>}}; - {error, Node, {badrpc, R}} -> - Message = list_to_binary(io_lib:format("bad rpc call ~p, Reason ~p", [Node, R])), + {error, Node, Error} -> + Message = list_to_binary(io_lib:format("bad rpc call ~p, Reason ~p", [Node, Error])), {500, #{code => <<"NODE_DOWN">>, message => Message}}; Response -> {200, Response} diff --git a/apps/emqx_rule_engine/src/emqx_rule_engine_api.erl b/apps/emqx_rule_engine/src/emqx_rule_engine_api.erl index d66f2c1c9..d36ec6a40 100644 --- a/apps/emqx_rule_engine/src/emqx_rule_engine_api.erl +++ b/apps/emqx_rule_engine/src/emqx_rule_engine_api.erl @@ -339,6 +339,9 @@ param_path_id() -> of {error, page_limit_invalid} -> {400, #{code => 'BAD_REQUEST', message => <<"page_limit_invalid">>}}; + {error, Node, Error} -> + Message = list_to_binary(io_lib:format("bad rpc call ~p, Reason ~p", [Node, Error])), + {500, #{code => <<"NODE_DOWN">>, message => Message}}; Result -> {200, Result} end; diff --git a/changes/ce/fix-10760.en.md b/changes/ce/fix-10760.en.md new file mode 100644 index 000000000..42c71e66a --- /dev/null +++ b/changes/ce/fix-10760.en.md @@ -0,0 +1 @@ +Fix Internal Error 500 that occurred sometimes when bridge statistics page was updated while a node was (re)joining the cluster.