From f276ea9e91b196f3f5034d65479b761813991f5f Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Thu, 3 Aug 2023 13:24:22 +0300 Subject: [PATCH 1/2] fix: increase emqx_router_sup restart intensity The goal is to tolerate occasional crashes that can happen under relatively normal conditions and don't seem critical enough to shut down the whole app (emqx). For example, a mria write/delete call delegated from a replicant to a core node may fail, if the core node is being stopped / restarted / not ready. Fixes: EMQX-10703, #11310 --- apps/emqx/src/emqx.app.src | 2 +- apps/emqx/src/emqx_router_sup.erl | 7 ++++++- changes/ce/fix-11388.en.md | 6 ++++++ 3 files changed, 13 insertions(+), 2 deletions(-) create mode 100644 changes/ce/fix-11388.en.md diff --git a/apps/emqx/src/emqx.app.src b/apps/emqx/src/emqx.app.src index cff8cf35b..d9598ee1b 100644 --- a/apps/emqx/src/emqx.app.src +++ b/apps/emqx/src/emqx.app.src @@ -2,7 +2,7 @@ {application, emqx, [ {id, "emqx"}, {description, "EMQX Core"}, - {vsn, "5.1.4"}, + {vsn, "5.1.5"}, {modules, []}, {registered, []}, {applications, [ diff --git a/apps/emqx/src/emqx_router_sup.erl b/apps/emqx/src/emqx_router_sup.erl index d0e5ea05a..0fa48d9d2 100644 --- a/apps/emqx/src/emqx_router_sup.erl +++ b/apps/emqx/src/emqx_router_sup.erl @@ -41,4 +41,9 @@ init([]) -> hash, {emqx_router, start_link, []} ]), - {ok, {{one_for_all, 0, 1}, [Helper, RouterPool]}}. + SupFlags = #{ + strategy => one_for_one, + intensity => 10, + period => 100 + }, + {ok, {SupFlags, [Helper, RouterPool]}}. diff --git a/changes/ce/fix-11388.en.md b/changes/ce/fix-11388.en.md new file mode 100644 index 000000000..835155585 --- /dev/null +++ b/changes/ce/fix-11388.en.md @@ -0,0 +1,6 @@ +Increase `emqx_router_sup` restart intensity. + +The goal is to tolerate occasional crashes that can happen under relatively normal conditions +and don't seem critical enough to shut down the whole app (emqx). 
+For example, a mria write/delete call delegated from a replicant to a core node by `emqx_router_helper` may fail, +if the core node is being stopped / restarted / not ready. From 2b7798608d0676cc24245104b5ab24ef66dd48bf Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Fri, 4 Aug 2023 16:55:36 +0300 Subject: [PATCH 2/2] fix(emqx_router_helper): don't clean up down node on a replicant The cleanup on a replicant node is redundant, as Mria would delegate this delete op to a core node (via RPC), and the core node is expected to receive the same `nodedown` message and process it. --- apps/emqx/src/emqx_router_helper.erl | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/apps/emqx/src/emqx_router_helper.erl b/apps/emqx/src/emqx_router_helper.erl index 78cf62d6c..8d96bf81d 100644 --- a/apps/emqx/src/emqx_router_helper.erl +++ b/apps/emqx/src/emqx_router_helper.erl @@ -146,13 +146,18 @@ handle_info({mnesia_table_event, Event}, State) -> ?SLOG(debug, #{msg => "unexpected_mnesia_table_event", event => Event}), {noreply, State}; handle_info({nodedown, Node}, State = #{nodes := Nodes}) -> - global:trans( - {?LOCK, self()}, - fun() -> - mria:transaction(?ROUTE_SHARD, fun ?MODULE:cleanup_routes/1, [Node]) - end - ), - ok = mria:dirty_delete(?ROUTING_NODE, Node), + case mria_rlog:role() of + core -> + global:trans( + {?LOCK, self()}, + fun() -> + mria:transaction(?ROUTE_SHARD, fun ?MODULE:cleanup_routes/1, [Node]) + end + ), + ok = mria:dirty_delete(?ROUTING_NODE, Node); + replicant -> + ok + end, ?tp(emqx_router_helper_cleanup_done, #{node => Node}), {noreply, State#{nodes := lists:delete(Node, Nodes)}, hibernate}; handle_info({membership, {mnesia, down, Node}}, State) ->