Merge pull request #11388 from SergeTupchiy/EMQX-10703-fix-replicant-crash-when-core-terminates-abnormally

fix: increase emqx_router_sup restart intensity
This commit is contained in:
zhongwencool 2023-08-06 14:40:32 +08:00 committed by GitHub
commit 665695a977
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 24 additions and 8 deletions

View File

@ -146,13 +146,18 @@ handle_info({mnesia_table_event, Event}, State) ->
?SLOG(debug, #{msg => "unexpected_mnesia_table_event", event => Event}), ?SLOG(debug, #{msg => "unexpected_mnesia_table_event", event => Event}),
{noreply, State}; {noreply, State};
handle_info({nodedown, Node}, State = #{nodes := Nodes}) -> handle_info({nodedown, Node}, State = #{nodes := Nodes}) ->
case mria_rlog:role() of
core ->
global:trans( global:trans(
{?LOCK, self()}, {?LOCK, self()},
fun() -> fun() ->
mria:transaction(?ROUTE_SHARD, fun ?MODULE:cleanup_routes/1, [Node]) mria:transaction(?ROUTE_SHARD, fun ?MODULE:cleanup_routes/1, [Node])
end end
), ),
ok = mria:dirty_delete(?ROUTING_NODE, Node), ok = mria:dirty_delete(?ROUTING_NODE, Node);
replicant ->
ok
end,
?tp(emqx_router_helper_cleanup_done, #{node => Node}), ?tp(emqx_router_helper_cleanup_done, #{node => Node}),
{noreply, State#{nodes := lists:delete(Node, Nodes)}, hibernate}; {noreply, State#{nodes := lists:delete(Node, Nodes)}, hibernate};
handle_info({membership, {mnesia, down, Node}}, State) -> handle_info({membership, {mnesia, down, Node}}, State) ->

View File

@ -41,4 +41,9 @@ init([]) ->
hash, hash,
{emqx_router, start_link, []} {emqx_router, start_link, []}
]), ]),
{ok, {{one_for_all, 0, 1}, [Helper, RouterPool]}}. SupFlags = #{
strategy => one_for_one,
intensity => 10,
period => 100
},
{ok, {SupFlags, [Helper, RouterPool]}}.

View File

@ -0,0 +1,6 @@
Increase `emqx_router_sup` restart intensity.
The goal is to tolerate occasional crashes that can happen under relatively normal conditions
and do not seem critical enough to shut down the whole app (emqx).
For example, a mria write/delete call delegated from a replicant to a core node by `emqx_router_helper` may fail
if the core node is being stopped, is being restarted, or is not ready.