feat: add graceful shutdown

prior to this change, emqx node shutdown was done by init:stop,
which might stop the applications in an undesired order

in this change, emqx_machine_terminator is added to stop apps
in a defined order and then terminate the node by calling init:stop
in a loop until the VM exits
This commit is contained in:
Zaiming Shi 2021-08-04 21:40:25 +02:00
parent 70e49ab629
commit bc23ff5e47
7 changed files with 113 additions and 17 deletions

View File

@ -0,0 +1,27 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2021 EMQ Technologies Co., Ltd. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------
-module(emqx_machine).
-export([start/0,
graceful_shutdown/0
]).
%% @doc Start the machine-level helper process by calling
%% emqx_machine_terminator:start/0.
%% Returns `ok'; any other result from the terminator fails the match and
%% crashes the caller with `badmatch'.
start() ->
ok = emqx_machine_terminator:start().
%% @doc Request a graceful shutdown of this node.
%% Thin wrapper around emqx_machine_terminator:graceful/0; the result (or
%% exit) of that call is passed through unchanged.
graceful_shutdown() ->
emqx_machine_terminator:graceful().

View File

@ -18,11 +18,9 @@
-export([ start/2
, stop/1
, prep_stop/1
]).
%% Shutdown and reboot
-export([ shutdown/1
-export([ stop_apps/1
, ensure_apps_started/0
]).
@ -50,11 +48,9 @@ start(_Type, _Args) ->
ok = print_vsn(),
ok = start_autocluster(),
ok = emqx_machine:start(),
{ok, RootSupPid}.
prep_stop(_State) ->
application:stop(emqx).
stop(_State) ->
ok.
@ -96,13 +92,13 @@ load_config_files() ->
ok = emqx_app:set_init_config_load_done().
start_autocluster() ->
ekka:callback(prepare, fun ?MODULE:shutdown/1),
ekka:callback(prepare, fun ?MODULE:stop_apps/1),
ekka:callback(reboot, fun ?MODULE:ensure_apps_started/0),
_ = ekka:autocluster(emqx), %% returns 'ok' or a pid or 'any()' as in spec
ok.
shutdown(Reason) ->
?SLOG(critical, #{msg => "stopping_apps", reason => Reason}),
stop_apps(Reason) ->
?SLOG(info, #{msg => "stopping_apps", reason => Reason}),
_ = emqx_alarm_handler:unload(),
lists:foreach(fun stop_one_app/1, lists:reverse(sorted_reboot_apps())).

View File

@ -0,0 +1,67 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2021 EMQ Technologies Co., Ltd. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------
-module(emqx_machine_terminator).
-export([ start/0
, graceful/0
, terminator_loop/0
]).
-define(TERMINATOR, ?MODULE).
%% @doc Start the terminator process and register it as `?TERMINATOR'.
%%
%% The terminator exists so that a shutdown requested by an RPC call from a
%% remote shell node (see graceful/0) is carried out by a dedicated local
%% process instead of the RPC spawned process, which has a remote group leader.
%%
%% The spawned process is linked to the caller. The name is registered by the
%% caller before this function returns, so graceful/0 can never observe a
%% terminator that has been started but not yet registered.
%%
%% Returns `ok'. Fails with `badarg' if the name is already registered.
start() ->
    Pid = spawn_link(fun terminator_loop/0),
    %% Register from the caller (not from inside the new process) so that
    %% registration is complete, or has visibly failed, when start/0 returns.
    true = register(?TERMINATOR, Pid),
    ok.
%% @doc Main loop of the terminator process. Exported for internal use only:
%% the fully qualified self-call below lets the process move to a newly
%% loaded version of this module.
%%
%% On `graceful_shutdown' the apps are stopped via
%% emqx_machine_app:stop_apps/1 and then the VM is stopped by exit_loop/0.
%% A failure while stopping the apps is logged but does not prevent the VM
%% from being stopped; otherwise this process would die, graceful/0 would
%% receive its 'DOWN' message and report success while the node keeps running.
terminator_loop() ->
    receive
        graceful_shutdown ->
            try
                ok = emqx_machine_app:stop_apps(normal)
            catch
                Class:Reason:Stacktrace ->
                    logger:error(#{msg => "failed_to_stop_apps",
                                   exception => Class,
                                   reason => Reason,
                                   stacktrace => Stacktrace})
            after
                %% does not return: keeps calling init:stop/0 until the VM exits
                exit_loop()
            end
    after
        1000 ->
            %% keep looping for beam reload
            ?MODULE:terminator_loop()
    end.
%% @doc Shutdown the Erlang VM.
%% Looks up the registered terminator process, asks it to shut down and
%% blocks until that process is down. Exits with `emqx_machine_not_started'
%% when no terminator is registered.
graceful() ->
    request_shutdown(whereis(?TERMINATOR)).

%% Send the shutdown request to the terminator and wait for it to go down.
request_shutdown(undefined) ->
    exit(emqx_machine_not_started);
request_shutdown(Terminator) ->
    Terminator ! graceful_shutdown,
    MRef = monitor(process, Terminator),
    %% NOTE: not exactly sure, but maybe there is a chance that the
    %% Erlang VM goes down before this receive completes.
    %% In which case, the remote caller will get {badrpc, nodedown}
    receive
        {'DOWN', MRef, process, Terminator, _Info} ->
            ok
    end.
%% Ask the runtime to stop, pause for 100 ms, and ask again; repeats until
%% the Erlang VM exits. Never returns.
exit_loop() ->
    init:stop(),
    %% plain timed receive: a 100 ms pause between stop requests
    receive
    after 100 ->
        ok
    end,
    exit_loop().

View File

@ -33,9 +33,9 @@ end_per_suite(_Config) ->
emqx_ct_helpers:stop_apps([]).
t_shutdown_reboot(_Config) ->
emqx_machine_app:shutdown(normal),
emqx_machine_app:stop_apps(normal),
false = emqx:is_running(node()),
emqx_machine_app:ensure_apps_started(),
true = emqx:is_running(node()),
ok = emqx_machine_app:shutdown(for_test),
ok = emqx_machine_app:stop_apps(for_test),
false = emqx:is_running(node()).

View File

@ -99,7 +99,7 @@ relx_usage() {
echo " don't make it permanent"
;;
*)
echo "Usage: $REL_NAME {start|start_boot <file>|ertspath|foreground|stop|restart|reboot|pid|ping|console|console_clean|console_boot <file>|attach|remote_console|upgrade|downgrade|install|uninstall|versions|escript|ctl|rpc|rpcterms|eval|root_dir}"
echo "Usage: $REL_NAME {start|start_boot <file>|ertspath|foreground|stop|pid|ping|console|console_clean|console_boot <file>|attach|remote_console|upgrade|downgrade|install|uninstall|versions|escript|ctl|rpc|rpcterms|eval|root_dir}"
;;
esac
}

View File

@ -72,9 +72,15 @@ do(Args) ->
%% a "pong"
io:format("pong\n");
["stop"] ->
io:format("~p\n", [rpc:call(TargetNode, init, stop, [], 60000)]);
["restart", "-config", ConfigFile | _RestArgs1] ->
io:format("~p\n", [rpc:call(TargetNode, emqx, restart, [ConfigFile], 60000)]);
case rpc:call(TargetNode, emqx_machine, graceful_shutdown, [], 60000) of
ok ->
ok;
{badrpc, nodedown} ->
%% nodetool commands are always executed after a ping
%% which if the code gets here, it's because the target node
%% has shutdown before RPC returns.
ok
end;
["rpc", Module, Function | RpcArgs] ->
case rpc:call(TargetNode, list_to_atom(Module), list_to_atom(Function),
[RpcArgs], 60000) of
@ -141,7 +147,7 @@ do(Args) ->
end;
Other ->
io:format("Other: ~p\n", [Other]),
io:format("Usage: nodetool {genconfig, chkconfig|getpid|ping|stop|restart|reboot|rpc|rpc_infinity|rpcterms|eval [Terms]} [RPC]\n")
io:format("Usage: nodetool {genconfig, chkconfig|getpid|ping|stop|rpc|rpc_infinity|rpcterms|eval [Terms]} [RPC]\n")
end,
net_kernel:stop().