diff --git a/apps/emqx/src/emqx_router_helper.erl b/apps/emqx/src/emqx_router_helper.erl index 78cf62d6c..8d96bf81d 100644 --- a/apps/emqx/src/emqx_router_helper.erl +++ b/apps/emqx/src/emqx_router_helper.erl @@ -146,13 +146,18 @@ handle_info({mnesia_table_event, Event}, State) -> ?SLOG(debug, #{msg => "unexpected_mnesia_table_event", event => Event}), {noreply, State}; handle_info({nodedown, Node}, State = #{nodes := Nodes}) -> - global:trans( - {?LOCK, self()}, - fun() -> - mria:transaction(?ROUTE_SHARD, fun ?MODULE:cleanup_routes/1, [Node]) - end - ), - ok = mria:dirty_delete(?ROUTING_NODE, Node), + case mria_rlog:role() of + core -> + global:trans( + {?LOCK, self()}, + fun() -> + mria:transaction(?ROUTE_SHARD, fun ?MODULE:cleanup_routes/1, [Node]) + end + ), + ok = mria:dirty_delete(?ROUTING_NODE, Node); + replicant -> + ok + end, ?tp(emqx_router_helper_cleanup_done, #{node => Node}), {noreply, State#{nodes := lists:delete(Node, Nodes)}, hibernate}; handle_info({membership, {mnesia, down, Node}}, State) -> diff --git a/apps/emqx/src/emqx_router_sup.erl b/apps/emqx/src/emqx_router_sup.erl index d0e5ea05a..0fa48d9d2 100644 --- a/apps/emqx/src/emqx_router_sup.erl +++ b/apps/emqx/src/emqx_router_sup.erl @@ -41,4 +41,9 @@ init([]) -> hash, {emqx_router, start_link, []} ]), - {ok, {{one_for_all, 0, 1}, [Helper, RouterPool]}}. + SupFlags = #{ + strategy => one_for_one, + intensity => 10, + period => 100 + }, + {ok, {SupFlags, [Helper, RouterPool]}}. diff --git a/changes/ce/fix-11388.en.md b/changes/ce/fix-11388.en.md new file mode 100644 index 000000000..835155585 --- /dev/null +++ b/changes/ce/fix-11388.en.md @@ -0,0 +1,6 @@ +Increase `emqx_router_sup` restart intensity. + +The goal is to tolerate occasional crashes that can happen under relatively normal conditions +and don't seem critical to shutdown the whole app (emqx). 
+For example, a mria write/delete call delegated from a replicant to a core node by `emqx_router_helper` may fail +if the core node is being stopped, restarted, or is not yet ready.