fix: improve node stop wait loop

This commit is contained in:
Zaiming Shi 2021-08-04 13:05:50 +02:00
parent 9f3063a823
commit 668ecbe97b
2 changed files with 48 additions and 36 deletions

View File

@ -41,7 +41,7 @@ start_link() ->
is_running() -> is_pid(whereis(?TERMINATOR)). is_running() -> is_pid(whereis(?TERMINATOR)).
%% @doc Send a signal to activate the terminator. %% @doc Call terminator to stop applications, then issue init:stop() to terminate the VM.
graceful() -> graceful() ->
try try
_ = gen_server:call(?TERMINATOR, ?DO_IT, infinity) _ = gen_server:call(?TERMINATOR, ?DO_IT, infinity)
@ -52,28 +52,19 @@ graceful() ->
%% should issue a shutdown to be sure %% should issue a shutdown to be sure
%% NOTE: not exit_loop here because we do not want to %% NOTE: not exit_loop here because we do not want to
%% block erl_signal_server %% block erl_signal_server
?ELOG("Shutdown before node is ready?~n", []),
init:stop() init:stop()
end, end,
ok. ok.
%% @doc Shutdown the Erlang VM and wait until the terminator dies or the VM dies. %% @doc Shutdown the Erlang VM and wait indefinitely.
graceful_wait() -> graceful_wait() ->
case whereis(?TERMINATOR) of ok = graceful(),
undefined -> exit_loop().
?SLOG(warning, #{msg => "shutdown_before_boot_is_complete"}),
exit_loop();
Pid ->
ok = graceful(),
Ref = monitor(process, Pid),
%% NOTE: not exactly sure, but maybe there is a chance that
%% Erlang VM goes down before this receive.
%% In which case, the remote caller will get {badrpc, nodedown}
receive {'DOWN', Ref, process, Pid, _} -> ok end
end.
exit_loop() -> exit_loop() ->
init:stop(),
timer:sleep(100), timer:sleep(100),
init:stop(),
exit_loop(). exit_loop().
init(_) -> init(_) ->

View File

@ -299,6 +299,33 @@ bootstrapd() {
fi fi
} }
# Check whether the process with the given PID is down.
# Arguments:
#   $1 - process ID to probe
# Returns:
#   0 if the signal-0 probe could not be delivered (process treated as down),
#   1 if the probe succeeded (process is still running).
# NOTE(review): `kill -s 0` also fails when the caller is not permitted to
# signal the target, so such a process is reported as down too -- confirm this
# is acceptable for every caller.
is_down() {
    # Keep the PID local so the caller's own $PID variable is never clobbered.
    local pid="$1"
    if kill -s 0 "$pid" 2>/dev/null; then
        return 1
    fi
    return 0
}
# Retry a command once per second until it succeeds or the time budget is spent.
# Usage: wait_for SECONDS COMMAND [ARG...]
# Arguments:
#   $1   - number of seconds (integer) to keep retrying
#   $2.. - command and its arguments; its stdout/stderr are discarded
# Returns:
#   0 as soon as the command exits successfully,
#   1 if it still fails once the budget has reached zero.
# The command is always attempted at least once, even when SECONDS is 0.
wait_for() {
    local WAIT_TIME
    WAIT_TIME="$1"
    shift
    while true; do
        # Run the remaining arguments as-is: "$@" keeps each argument intact,
        # whereas flattening into a string would re-split on whitespace.
        if "$@" >/dev/null 2>&1; then
            return 0
        fi
        if [ "$WAIT_TIME" -le 0 ]; then
            return 1
        fi
        WAIT_TIME=$((WAIT_TIME - 1))
        sleep 1
    done
}
# Use $CWD/etc/sys.config if exists # Use $CWD/etc/sys.config if exists
if [ -z "$RELX_CONFIG_PATH" ]; then if [ -z "$RELX_CONFIG_PATH" ]; then
if [ -f "$RUNNER_ETC_DIR/sys.config" ]; then if [ -f "$RUNNER_ETC_DIR/sys.config" ]; then
@ -437,22 +464,16 @@ case "$1" in
"$(relx_start_command)" "$(relx_start_command)"
WAIT_TIME=${WAIT_FOR_ERLANG:-15} WAIT_TIME=${WAIT_FOR_ERLANG:-15}
while [ "$WAIT_TIME" -gt 0 ]; do if wait_for "$WAIT_TIME" 'relx_nodetool' 'ping'; then
if ! relx_nodetool "ping" >/dev/null 2>&1; then echo "$EMQX_DESCRIPTION $REL_VSN is started successfully!"
WAIT_TIME=$((WAIT_TIME - 1)) exit 0
sleep 1 else
continue echo "$EMQX_DESCRIPTION $REL_VSN failed to start within ${WAIT_TIME} seconds,"
fi echo "see the output of '$0 console' for more information."
sleep 1 echo "If you want to wait longer, set the environment variable"
if relx_nodetool "ping" >/dev/null 2>&1; then echo "WAIT_FOR_ERLANG to the number of seconds to wait."
echo "$EMQX_DESCRIPTION $REL_VSN is started successfully!" exit 1
exit 0 fi
fi
done && echo "$EMQX_DESCRIPTION $REL_VSN failed to start within ${WAIT_FOR_ERLANG:-15} seconds,"
echo "see the output of '$0 console' for more information."
echo "If you want to wait longer, set the environment variable"
echo "WAIT_FOR_ERLANG to the number of seconds to wait."
exit 1
;; ;;
stop) stop)
@ -462,11 +483,11 @@ case "$1" in
echoerr "emqx_graceful_shutdown_failed PID=[$PID]" echoerr "emqx_graceful_shutdown_failed PID=[$PID]"
exit 1 exit 1
fi fi
while kill -s 0 "$PID" 2>/dev/null; do WAIT_TIME=30
sleep 1 if ! wait_for "$WAIT_TIME" is_down "$PID"; then
done echoerr "emqx_pid_dangling_after ${WAIT_TIME} seconds PID=[$PID]"
echoerr "emqx_pid_dangling_after ${max_wait} seconds PID=[$PID]" exit 1
exit 1 fi
;; ;;
restart|reboot) restart|reboot)