diff --git a/apps/emqx_machine/src/emqx_machine_terminator.erl b/apps/emqx_machine/src/emqx_machine_terminator.erl index 524cf316d..a1f66deb6 100644 --- a/apps/emqx_machine/src/emqx_machine_terminator.erl +++ b/apps/emqx_machine/src/emqx_machine_terminator.erl @@ -41,7 +41,7 @@ start_link() -> is_running() -> is_pid(whereis(?TERMINATOR)). -%% @doc Send a signal to activate the terminator. +%% @doc Call terminator to stop applications, then issue init:stop() to terminate the VM. graceful() -> try _ = gen_server:call(?TERMINATOR, ?DO_IT, infinity) @@ -52,28 +52,19 @@ graceful() -> %% should issue a shutdown to be sure %% NOTE: not exit_loop here because we do not want to %% block erl_signal_server + ?ELOG("Shutdown before node is ready?~n", []), init:stop() end, ok. -%% @doc Shutdown the Erlang VM and wait until the terminator dies or the VM dies. +%% @doc Shut down the Erlang VM and wait indefinitely. graceful_wait() -> - case whereis(?TERMINATOR) of - undefined -> - ?SLOG(warning, #{msg => "shutdown_before_boot_is_complete"}), - exit_loop(); - Pid -> - ok = graceful(), - Ref = monitor(process, Pid), - %% NOTE: not exactly sure, but maybe there is a chance that - %% Erlang VM goes down before this receive. - %% In which case, the remote caller will get {badrpc, nodedown} - receive {'DOWN', Ref, process, Pid, _} -> ok end - end. + ok = graceful(), + exit_loop(). exit_loop() -> - init:stop(), timer:sleep(100), + init:stop(), exit_loop(). 
init(_) -> diff --git a/bin/emqx b/bin/emqx index 436b94b18..c582f8fbf 100755 --- a/bin/emqx +++ b/bin/emqx @@ -299,6 +299,33 @@ bootstrapd() { fi } +# check if a PID is down +is_down() { + PID="$1" + if kill -s 0 "$PID" 2>/dev/null; then + return 1 + fi + return 0 +} + +wait_for() { + local WAIT_TIME + local CMD + WAIT_TIME="$1" + shift + CMD="$*" + while true; do + if $CMD >/dev/null 2>&1; then + return 0 + fi + if [ "$WAIT_TIME" -le 0 ]; then + return 1 + fi + WAIT_TIME=$((WAIT_TIME - 1)) + sleep 1 + done +} + # Use $CWD/etc/sys.config if exists if [ -z "$RELX_CONFIG_PATH" ]; then if [ -f "$RUNNER_ETC_DIR/sys.config" ]; then @@ -437,22 +464,16 @@ case "$1" in "$(relx_start_command)" WAIT_TIME=${WAIT_FOR_ERLANG:-15} - while [ "$WAIT_TIME" -gt 0 ]; do - if ! relx_nodetool "ping" >/dev/null 2>&1; then - WAIT_TIME=$((WAIT_TIME - 1)) - sleep 1 - continue - fi - sleep 1 - if relx_nodetool "ping" >/dev/null 2>&1; then - echo "$EMQX_DESCRIPTION $REL_VSN is started successfully!" - exit 0 - fi - done && echo "$EMQX_DESCRIPTION $REL_VSN failed to start within ${WAIT_FOR_ERLANG:-15} seconds," - echo "see the output of '$0 console' for more information." - echo "If you want to wait longer, set the environment variable" - echo "WAIT_FOR_ERLANG to the number of seconds to wait." - exit 1 + if wait_for "$WAIT_TIME" 'relx_nodetool' 'ping'; then + echo "$EMQX_DESCRIPTION $REL_VSN is started successfully!" + exit 0 + else + echo "$EMQX_DESCRIPTION $REL_VSN failed to start within ${WAIT_TIME} seconds," + echo "see the output of '$0 console' for more information." + echo "If you want to wait longer, set the environment variable" + echo "WAIT_FOR_ERLANG to the number of seconds to wait." + exit 1 + fi ;; stop) @@ -462,11 +483,11 @@ case "$1" in echoerr "emqx_graceful_shutdown_failed PID=[$PID]" exit 1 fi - while kill -s 0 "$PID" 2>/dev/null; do - sleep 1 - done - echoerr "emqx_pid_dangling_after ${max_wait} seconds PID=[$PID]" - exit 1 + WAIT_TIME=30 + if ! 
wait_for "$WAIT_TIME" is_down "$PID"; then + echoerr "emqx_pid_dangling_after ${WAIT_TIME} seconds PID=[$PID]" + exit 1 + fi ;; restart|reboot)