diff --git a/bin/emqx b/bin/emqx index feed46f62..b65b5b431 100755 --- a/bin/emqx +++ b/bin/emqx @@ -812,7 +812,7 @@ is_down() { if ps -p "$PID" >/dev/null; then # still around # shellcheck disable=SC2009 # this grep pattern is not a part of the program names - if ps -efp "$PID" | $GREP -q 'defunct'; then + if ps -fp "$PID" | $GREP -q 'defunct'; then # zombie state, print parent pid parent="$(ps -o ppid= -p "$PID" | tr -d ' ')" logwarn "$PID is marked <defunct>, parent: $(ps -p "$parent")" @@ -831,7 +831,7 @@ wait_for() { shift CMD="$*" while true; do - if $CMD >/dev/null 2>&1; then + if $CMD; then return 0 fi if [ "$WAIT_TIME" -le 0 ]; then diff --git a/bin/nodetool b/bin/nodetool index a96f5f9fd..3af3bd21a 100755 --- a/bin/nodetool +++ b/bin/nodetool @@ -8,6 +8,8 @@ %% ------------------------------------------------------------------- -mode(compile). +-define(SHUTDOWN_TIMEOUT_MS, 120_000). + main(Args) -> case os:type() of {win32, nt} -> ok; @@ -85,9 +87,17 @@ do(Args) -> %% a "pong" io:format("pong\n"); ["stop"] -> - case rpc:call(TargetNode, emqx_machine, graceful_shutdown, [], 60000) of + Pid = start_shutdown_status(), + Res = rpc:call(TargetNode, emqx_machine, graceful_shutdown, [], ?SHUTDOWN_TIMEOUT_MS), + true = stop_shutdown_status(Pid), + case Res of ok -> ok; + {badrpc, timeout} -> + io:format("EMQX is still shutting down, it failed to stop gracefully " + "within the configured timeout of: ~ps\n", + [erlang:convert_time_unit(?SHUTDOWN_TIMEOUT_MS, millisecond, second)]), + halt(1); {badrpc, nodedown} -> %% nodetool commands are always executed after a ping %% which if the code gets here, it's because the target node @@ -145,6 +155,18 @@ do(Args) -> end, net_kernel:stop(). +start_shutdown_status() -> + spawn_link(fun shutdown_status_loop/0). + +stop_shutdown_status(Pid) -> + true = unlink(Pid), + true = exit(Pid, stop). + +shutdown_status_loop() -> + timer:sleep(10_000), + io:format("EMQX is shutting down, please wait...\n", []), + shutdown_status_loop(). 
+ parse_eval_args(Args) -> % shells may process args into more than one, and end up stripping % spaces, so this converts all of that to a single string to parse diff --git a/changes/ce/fix-11567.en.md b/changes/ce/fix-11567.en.md new file mode 100644 index 000000000..c19db971b --- /dev/null +++ b/changes/ce/fix-11567.en.md @@ -0,0 +1,4 @@ +Improve EMQX graceful shutdown (`emqx stop` command): +- increase timeout from 1 to 2 minutes +- print an error message if EMQX can't stop gracefully within the configured timeout +- print periodic status messages while EMQX is shutting down