From 66e848b771f22103c667a841ec966b745c368835 Mon Sep 17 00:00:00 2001
From: Shawn <506895667@qq.com>
Date: Fri, 17 Dec 2021 15:30:08 +0800
Subject: [PATCH] fix(code_port): improve node stop wait loop

---
 bin/emqx | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 67 insertions(+), 3 deletions(-)

diff --git a/bin/emqx b/bin/emqx
index 6a80f4d8d..be18a85e7 100755
--- a/bin/emqx
+++ b/bin/emqx
@@ -4,6 +4,12 @@
 
 set -e
 
+# Set DEBUG=1 to trace every command executed by this script.
+DEBUG="${DEBUG:-0}"
+if [ "$DEBUG" = 1 ]; then
+    set -x
+fi
+
 ROOT_DIR="$(cd "$(dirname "$(readlink "$0" || echo "$0")")"/..; pwd -P)"
 # shellcheck disable=SC1090
 . "$ROOT_DIR"/releases/emqx_vars
@@ -299,6 +305,52 @@ generate_config() {
     fi
 }
 
+# Check whether the process with the given PID is down.
+# A zombie (defunct) process counts as down; its parent is printed so the
+# operator can see which process has yet to reap it.
+# Arguments: $1 - PID to check
+# Returns:   0 if the process is gone or defunct, 1 if it is still running
+is_down() {
+    local PID parent
+    PID="$1"
+    if ps -p "$PID" >/dev/null; then
+        # still around
+        # shellcheck disable=SC2009 # this grep pattern is not a part of the program names
+        if ps -p "$PID" | grep -q 'defunct'; then
+            # zombie state, print parent pid
+            parent="$(ps -o ppid= -p "$PID" | tr -d ' ')"
+            echo "WARN: $PID is marked <defunct>, parent:"
+            ps -p "$parent"
+            return 0
+        fi
+        return 1
+    fi
+    # it's gone
+    return 0
+}
+
+# Poll a command once per second until it succeeds or the time budget is spent.
+# The command's stdout and stderr are discarded.
+# Arguments: $1 - maximum number of seconds to wait
+#            $2... - command to run, followed by its arguments
+# Returns:   0 as soon as the command succeeds, 1 on timeout
+wait_for() {
+    local WAIT_TIME
+    WAIT_TIME="$1"
+    shift
+    while true; do
+        # run the arguments as-is so that ones containing spaces stay intact
+        if "$@" >/dev/null 2>&1; then
+            return 0
+        fi
+        if [ "$WAIT_TIME" -le 0 ]; then
+            return 1
+        fi
+        WAIT_TIME=$((WAIT_TIME - 1))
+        sleep 1
+    done
+}
+
 # Call bootstrapd for daemon commands like start/stop/console
 bootstrapd() {
     if [ -e "$RUNNER_DATA_DIR/.erlang.cookie" ]; then
@@ -485,11 +537,23 @@ case "$1" in
         # Wait for the node to completely stop...
         PID="$(relx_get_pid)"
         if ! relx_nodetool "stop"; then
+            echoerr "Graceful shutdown failed PID=[$PID]"
             exit 1
         fi
-        while kill -s 0 "$PID" 2>/dev/null; do
-            sleep 1
-        done
+        # WAIT_FOR_ERLANG_STOP: seconds to wait for the process to exit (default 60)
+        WAIT_TIME="${WAIT_FOR_ERLANG_STOP:-60}"
+        if ! wait_for "$WAIT_TIME" 'is_down' "$PID"; then
+            msg="dangling after ${WAIT_TIME} seconds"
+            # also log to syslog; best effort, so 'set -e' cannot hide the error below
+            logger -t "${REL_NAME}[${PID}]" "STOP: $msg" || true
+            # log to user console
+            echoerr "stop failed, $msg"
+            echo "ERROR: $PID is still around"
+            ps -p "$PID"
+            exit 1
+        fi
+        # best effort: a missing or failing logger must not fail a successful stop
+        logger -t "${REL_NAME}[${PID}]" "STOP: OK" || true
         ;;
 
     restart|reboot)