Merge pull request #11567 from SergeTupchiy/EMQX-10835-increase-graceful-stop-timeout

fix(nodetool): increase graceful stop timeout
This commit is contained in:
SergeTupchiy 2023-09-13 13:22:39 +03:00 committed by GitHub
commit cf334d5542
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 29 additions and 3 deletions

View File

@ -812,7 +812,7 @@ is_down() {
if ps -p "$PID" >/dev/null; then if ps -p "$PID" >/dev/null; then
# still around # still around
# shellcheck disable=SC2009 # this grep pattern is not a part of the program names # shellcheck disable=SC2009 # this grep pattern is not a part of the program names
if ps -efp "$PID" | $GREP -q 'defunct'; then if ps -fp "$PID" | $GREP -q 'defunct'; then
# zombie state, print parent pid # zombie state, print parent pid
parent="$(ps -o ppid= -p "$PID" | tr -d ' ')" parent="$(ps -o ppid= -p "$PID" | tr -d ' ')"
logwarn "$PID is marked <defunct>, parent: $(ps -p "$parent")" logwarn "$PID is marked <defunct>, parent: $(ps -p "$parent")"
@ -831,7 +831,7 @@ wait_for() {
shift shift
CMD="$*" CMD="$*"
while true; do while true; do
if $CMD >/dev/null 2>&1; then if $CMD; then
return 0 return 0
fi fi
if [ "$WAIT_TIME" -le 0 ]; then if [ "$WAIT_TIME" -le 0 ]; then

View File

@ -8,6 +8,8 @@
%% ------------------------------------------------------------------- %% -------------------------------------------------------------------
-mode(compile). -mode(compile).
-define(SHUTDOWN_TIMEOUT_MS, 120_000).
main(Args) -> main(Args) ->
case os:type() of case os:type() of
{win32, nt} -> ok; {win32, nt} -> ok;
@ -85,9 +87,17 @@ do(Args) ->
%% a "pong" %% a "pong"
io:format("pong\n"); io:format("pong\n");
["stop"] -> ["stop"] ->
case rpc:call(TargetNode, emqx_machine, graceful_shutdown, [], 60000) of Pid = start_shutdown_status(),
Res = rpc:call(TargetNode, emqx_machine, graceful_shutdown, [], ?SHUTDOWN_TIMEOUT_MS),
true = stop_shutdown_status(Pid),
case Res of
ok -> ok ->
ok; ok;
{badrpc, timeout} ->
io:format("EMQX is still shutting down, it failed to stop gracefully "
"within the configured timeout of: ~ps\n",
[erlang:convert_time_unit(?SHUTDOWN_TIMEOUT_MS, millisecond, second)]),
halt(1);
{badrpc, nodedown} -> {badrpc, nodedown} ->
%% nodetool commands are always executed after a ping %% nodetool commands are always executed after a ping
%% which if the code gets here, it's because the target node %% which if the code gets here, it's because the target node
@ -145,6 +155,18 @@ do(Args) ->
end, end,
net_kernel:stop(). net_kernel:stop().
%% Starts a process, linked to the caller, that runs shutdown_status_loop/0.
%% Returns the pid of the new process.
start_shutdown_status() ->
    erlang:spawn_link(fun() -> shutdown_status_loop() end).
%% Stops the status process identified by StatusPid.
%% The link is removed first, so the exit signal sent afterwards is not
%% propagated back to the caller. Returns `true'.
stop_shutdown_status(StatusPid) ->
    true = erlang:unlink(StatusPid),
    true = erlang:exit(StatusPid, stop).
%% Prints a progress message every 10 seconds, forever.
%% This function never returns; the process running it has to be
%% terminated from the outside.
shutdown_status_loop() ->
    %% Plain 10 second pause: no message patterns, only the timeout fires.
    receive
    after 10_000 -> ok
    end,
    io:format("EMQX is shutting down, please wait...\n"),
    shutdown_status_loop().
parse_eval_args(Args) -> parse_eval_args(Args) ->
% shells may process args into more than one, and end up stripping % shells may process args into more than one, and end up stripping
% spaces, so this converts all of that to a single string to parse % spaces, so this converts all of that to a single string to parse

View File

@ -0,0 +1,4 @@
Improve EMQX graceful shutdown (`emqx stop` command):
- increase timeout from 1 to 2 minutes
- print an error message if EMQX can't stop gracefully within the configured timeout
- print periodic status messages while EMQX is shutting down