From 9de9631d6b63095ade736036b4b94e2d689d9a40 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Tue, 5 Sep 2023 19:29:05 +0300 Subject: [PATCH 1/4] fix(nodetool): increase graceful stop timeout, handle and report `{badrpc, timeout}` error --- bin/nodetool | 9 ++++++++- changes/ce/fix-11567.en.md | 3 +++ 2 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 changes/ce/fix-11567.en.md diff --git a/bin/nodetool b/bin/nodetool index a96f5f9fd..511ef8e28 100755 --- a/bin/nodetool +++ b/bin/nodetool @@ -8,6 +8,8 @@ %% ------------------------------------------------------------------- -mode(compile). +-define(SHUTDOWN_TIMEOUT_MS, 120_000). + main(Args) -> case os:type() of {win32, nt} -> ok; @@ -85,9 +87,14 @@ do(Args) -> %% a "pong" io:format("pong\n"); ["stop"] -> - case rpc:call(TargetNode, emqx_machine, graceful_shutdown, [], 60000) of + case rpc:call(TargetNode, emqx_machine, graceful_shutdown, [], ?SHUTDOWN_TIMEOUT_MS) of ok -> ok; + {badrpc, timeout} -> + io:format("EMQX is still shutting down, it failed to stop gracefully " + "within the configured timeout of: ~ps\n", + [erlang:convert_time_unit(?SHUTDOWN_TIMEOUT_MS, millisecond, second)]), + halt(1); {badrpc, nodedown} -> %% nodetool commands are always executed after a ping %% which if the code gets here, it's because the target node diff --git a/changes/ce/fix-11567.en.md b/changes/ce/fix-11567.en.md new file mode 100644 index 000000000..026674f69 --- /dev/null +++ b/changes/ce/fix-11567.en.md @@ -0,0 +1,3 @@ +Improve EMQX graceful shutdown (`emqx stop` command): +- increase timeout from 1 to 2 minutes +- print an error message if EMQX can't stop gracefully within the configured timeout From 9b3f88aeeb25844decfa5d06fccf5c0e1e8c2e9b Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Wed, 6 Sep 2023 12:33:07 +0300 Subject: [PATCH 2/4] feat(nodetool): print shutdown status messages while EMQX is stopping --- bin/nodetool | 17 ++++++++++++++++- changes/ce/fix-11567.en.md | 1 + 2 files changed, 17 
insertions(+), 1 deletion(-) diff --git a/bin/nodetool b/bin/nodetool index 511ef8e28..3af3bd21a 100755 --- a/bin/nodetool +++ b/bin/nodetool @@ -87,7 +87,10 @@ do(Args) -> %% a "pong" io:format("pong\n"); ["stop"] -> - case rpc:call(TargetNode, emqx_machine, graceful_shutdown, [], ?SHUTDOWN_TIMEOUT_MS) of + Pid = start_shutdown_status(), + Res = rpc:call(TargetNode, emqx_machine, graceful_shutdown, [], ?SHUTDOWN_TIMEOUT_MS), + true = stop_shutdown_status(Pid), + case Res of ok -> ok; {badrpc, timeout} -> @@ -152,6 +155,18 @@ do(Args) -> end, net_kernel:stop(). +start_shutdown_status() -> + spawn_link(fun shutdown_status_loop/0). + +stop_shutdown_status(Pid) -> + true = unlink(Pid), + true = exit(Pid, stop). + +shutdown_status_loop() -> + timer:sleep(10_000), + io:format("EMQX is shutting down, please wait...\n", []), + shutdown_status_loop(). + parse_eval_args(Args) -> % shells may process args into more than one, and end up stripping % spaces, so this converts all of that to a single string to parse diff --git a/changes/ce/fix-11567.en.md b/changes/ce/fix-11567.en.md index 026674f69..c19db971b 100644 --- a/changes/ce/fix-11567.en.md +++ b/changes/ce/fix-11567.en.md @@ -1,3 +1,4 @@ Improve EMQX graceful shutdown (`emqx stop` command): - increase timeout from 1 to 2 minutes - print an error message if EMQX can't stop gracefully within the configured timeout +- print periodic status messages while EMQX is shutting down From 6dd9e54ab836188be81a99c7bfd6ca47a1165510 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Fri, 8 Sep 2023 13:43:12 +0300 Subject: [PATCH 3/4] fix(bin/emqx): remove `ps` `-e` opt, as it lists all processes regardless of `-p ` --- bin/emqx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/emqx b/bin/emqx index feed46f62..a51ffcfbb 100755 --- a/bin/emqx +++ b/bin/emqx @@ -812,7 +812,7 @@ is_down() { if ps -p "$PID" >/dev/null; then # still around # shellcheck disable=SC2009 # this grep pattern is not a part of the program names 
- if ps -efp "$PID" | $GREP -q 'defunct'; then + if ps -fp "$PID" | $GREP -q 'defunct'; then # zombie state, print parent pid parent="$(ps -o ppid= -p "$PID" | tr -d ' ')" logwarn "$PID is marked <defunct>, parent: $(ps -p "$parent")" From f790690d8b8d91dac631dfc7ffb3625686c7197e Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Fri, 8 Sep 2023 14:22:02 +0300 Subject: [PATCH 4/4] fix(bin/emqx): don't suppress `wait_for` command output, as it can print warning messages --- bin/emqx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/emqx b/bin/emqx index a51ffcfbb..b65b5b431 100755 --- a/bin/emqx +++ b/bin/emqx @@ -831,7 +831,7 @@ wait_for() { shift CMD="$*" while true; do - if $CMD >/dev/null 2>&1; then + if $CMD; then return 0 fi if [ "$WAIT_TIME" -le 0 ]; then