From bc23ff5e4709eb69812e5b79c6f3649a34b5ce7c Mon Sep 17 00:00:00 2001 From: Zaiming Shi Date: Wed, 4 Aug 2021 21:40:25 +0200 Subject: [PATCH] feat: add graceful shutdown Prior to this change, emqx node shutdown is done by init:stop, which might stop the applications in an undesired order. In this change, emqx_machine_terminator is added to stop the apps in a defined order and then terminate the node in an infinite loop. --- .ci/build_packages/tests.sh | 2 +- apps/emqx_machine/src/emqx_machine.erl | 27 ++++++++ apps/emqx_machine/src/emqx_machine_app.erl | 14 ++-- .../src/emqx_machine_terminator.erl | 67 +++++++++++++++++++ .../test/emqx_machine_app_SUITE.erl | 4 +- bin/emqx | 2 +- bin/nodetool | 14 ++-- 7 files changed, 113 insertions(+), 17 deletions(-) create mode 100644 apps/emqx_machine/src/emqx_machine.erl create mode 100644 apps/emqx_machine/src/emqx_machine_terminator.erl diff --git a/.ci/build_packages/tests.sh b/.ci/build_packages/tests.sh index 37998395d..b5ba48f2c 100755 --- a/.ci/build_packages/tests.sh +++ b/.ci/build_packages/tests.sh @@ -138,7 +138,7 @@ EOF exit 1 fi IDLE_TIME=0 - while ! curl http://localhost:8081/api/v5/status >/dev/null 2>&1; do + while ! curl http://localhost:8081/api/v5/status >/dev/null 2>&1; do if [ $IDLE_TIME -gt 10 ] then echo "emqx running error" diff --git a/apps/emqx_machine/src/emqx_machine.erl b/apps/emqx_machine/src/emqx_machine.erl new file mode 100644 index 000000000..d37d5cd4f --- /dev/null +++ b/apps/emqx_machine/src/emqx_machine.erl @@ -0,0 +1,27 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2021 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. 
+%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_machine). + +-export([start/0, + graceful_shutdown/0 + ]). + +start() -> + ok = emqx_machine_terminator:start(). + +graceful_shutdown() -> + emqx_machine_terminator:graceful(). diff --git a/apps/emqx_machine/src/emqx_machine_app.erl b/apps/emqx_machine/src/emqx_machine_app.erl index bc8c086c7..cf01af7a1 100644 --- a/apps/emqx_machine/src/emqx_machine_app.erl +++ b/apps/emqx_machine/src/emqx_machine_app.erl @@ -18,11 +18,9 @@ -export([ start/2 , stop/1 - , prep_stop/1 ]). -%% Shutdown and reboot --export([ shutdown/1 +-export([ stop_apps/1 , ensure_apps_started/0 ]). @@ -50,11 +48,9 @@ start(_Type, _Args) -> ok = print_vsn(), ok = start_autocluster(), + ok = emqx_machine:start(), {ok, RootSupPid}. -prep_stop(_State) -> - application:stop(emqx). - stop(_State) -> ok. @@ -96,13 +92,13 @@ load_config_files() -> ok = emqx_app:set_init_config_load_done(). start_autocluster() -> - ekka:callback(prepare, fun ?MODULE:shutdown/1), + ekka:callback(prepare, fun ?MODULE:stop_apps/1), ekka:callback(reboot, fun ?MODULE:ensure_apps_started/0), _ = ekka:autocluster(emqx), %% returns 'ok' or a pid or 'any()' as in spec ok. -shutdown(Reason) -> - ?SLOG(critical, #{msg => "stopping_apps", reason => Reason}), +stop_apps(Reason) -> + ?SLOG(info, #{msg => "stopping_apps", reason => Reason}), _ = emqx_alarm_handler:unload(), lists:foreach(fun stop_one_app/1, lists:reverse(sorted_reboot_apps())). 
diff --git a/apps/emqx_machine/src/emqx_machine_terminator.erl b/apps/emqx_machine/src/emqx_machine_terminator.erl new file mode 100644 index 000000000..7ed5f4d99 --- /dev/null +++ b/apps/emqx_machine/src/emqx_machine_terminator.erl @@ -0,0 +1,67 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2021 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_machine_terminator). + +-export([ start/0 + , graceful/0 + , terminator_loop/0 + ]). + +-define(TERMINATOR, ?MODULE). + +%% @doc This API is called to shutdown the Erlang VM by RPC call from remote shell node. +%% The shutdown of apps is delegated to a process instead of doing it in the RPC spawned +%% process which has a remote group leader. +start() -> + _ = spawn_link( + fun() -> + register(?TERMINATOR, self()), + terminator_loop() + end), + ok. + +%% internal use +terminator_loop() -> + receive + graceful_shutdown -> + ok = emqx_machine_app:stop_apps(normal), + exit_loop() + after + 1000 -> + %% keep looping for beam reload + ?MODULE:terminator_loop() + end. + +%% @doc Shutdown the Erlang VM. +graceful() -> + case whereis(?TERMINATOR) of + undefined -> + exit(emqx_machine_not_started); + Pid -> + Pid ! 
graceful_shutdown, + Ref = monitor(process, Pid), + %% NOTE: not exactly sure, but maybe there is a chance that + %% Erlang VM goes down before this receive. + %% In which case, the remote caller will get {badrpc, nodedown} + receive {'DOWN', Ref, process, Pid, _} -> ok end + end. + +%% Loop until Erlang VM exits +exit_loop() -> + init:stop(), + timer:sleep(100), + exit_loop(). diff --git a/apps/emqx_machine/test/emqx_machine_app_SUITE.erl b/apps/emqx_machine/test/emqx_machine_app_SUITE.erl index c1d666e53..e292af0ed 100644 --- a/apps/emqx_machine/test/emqx_machine_app_SUITE.erl +++ b/apps/emqx_machine/test/emqx_machine_app_SUITE.erl @@ -33,9 +33,9 @@ end_per_suite(_Config) -> emqx_ct_helpers:stop_apps([]). t_shutdown_reboot(_Config) -> - emqx_machine_app:shutdown(normal), + emqx_machine_app:stop_apps(normal), false = emqx:is_running(node()), emqx_machine_app:ensure_apps_started(), true = emqx:is_running(node()), - ok = emqx_machine_app:shutdown(for_test), + ok = emqx_machine_app:stop_apps(for_test), false = emqx:is_running(node()). 
diff --git a/bin/emqx b/bin/emqx index 0038cd2c7..0afa81bc5 100755 --- a/bin/emqx +++ b/bin/emqx @@ -99,7 +99,7 @@ relx_usage() { echo " don't make it permanent" ;; *) - echo "Usage: $REL_NAME {start|start_boot |ertspath|foreground|stop|restart|reboot|pid|ping|console|console_clean|console_boot |attach|remote_console|upgrade|downgrade|install|uninstall|versions|escript|ctl|rpc|rpcterms|eval|root_dir}" + echo "Usage: $REL_NAME {start|start_boot |ertspath|foreground|stop|pid|ping|console|console_clean|console_boot |attach|remote_console|upgrade|downgrade|install|uninstall|versions|escript|ctl|rpc|rpcterms|eval|root_dir}" ;; esac } diff --git a/bin/nodetool b/bin/nodetool index 373fdf97b..377ade040 100755 --- a/bin/nodetool +++ b/bin/nodetool @@ -72,9 +72,15 @@ do(Args) -> %% a "pong" io:format("pong\n"); ["stop"] -> - io:format("~p\n", [rpc:call(TargetNode, init, stop, [], 60000)]); - ["restart", "-config", ConfigFile | _RestArgs1] -> - io:format("~p\n", [rpc:call(TargetNode, emqx, restart, [ConfigFile], 60000)]); + case rpc:call(TargetNode, emqx_machine, graceful_shutdown, [], 60000) of + ok -> + ok; + {badrpc, nodedown} -> + %% nodetool commands are always executed after a ping + %% which if the code gets here, it's because the target node + %% has shutdown before RPC returns. + ok + end; ["rpc", Module, Function | RpcArgs] -> case rpc:call(TargetNode, list_to_atom(Module), list_to_atom(Function), [RpcArgs], 60000) of @@ -141,7 +147,7 @@ do(Args) -> end; Other -> io:format("Other: ~p\n", [Other]), - io:format("Usage: nodetool {genconfig, chkconfig|getpid|ping|stop|restart|reboot|rpc|rpc_infinity|rpcterms|eval [Terms]} [RPC]\n") + io:format("Usage: nodetool {genconfig, chkconfig|getpid|ping|stop|rpc|rpc_infinity|rpcterms|eval [Terms]} [RPC]\n") end, net_kernel:stop().