fix(code_port): improve node stop wait loop

This commit is contained in:
Shawn 2021-12-17 15:30:08 +08:00 committed by zhanghongtong
parent baf8d7d91c
commit 66e848b771
1 changed files with 55 additions and 3 deletions

View File

@ -4,6 +4,11 @@
set -e
DEBUG="${DEBUG:-0}"
if [ "$DEBUG" -eq 1 ]; then
set -x
fi
ROOT_DIR="$(cd "$(dirname "$(readlink "$0" || echo "$0")")"/..; pwd -P)"
# shellcheck disable=SC1090
. "$ROOT_DIR"/releases/emqx_vars
@ -299,6 +304,43 @@ generate_config() {
fi
}
# check if a PID is down
is_down() {
PID="$1"
if ps -p "$PID" >/dev/null; then
# still around
# shellcheck disable=SC2009 # this grep pattern is not a part of the progra names
if ps -p "$PID" | grep -q 'defunct'; then
# zombie state, print parent pid
parent="$(ps -o ppid= -p "$PID" | tr -d ' ')"
echo "WARN: $PID is marked <defunct>, parent:"
ps -p "$parent"
return 0
fi
return 1
fi
# it's gone
return 0
}
wait_for() {
local WAIT_TIME
local CMD
WAIT_TIME="$1"
shift
CMD="$*"
while true; do
if $CMD >/dev/null 2>&1; then
return 0
fi
if [ "$WAIT_TIME" -le 0 ]; then
return 1
fi
WAIT_TIME=$((WAIT_TIME - 1))
sleep 1
done
}
# Call bootstrapd for daemon commands like start/stop/console
bootstrapd() {
if [ -e "$RUNNER_DATA_DIR/.erlang.cookie" ]; then
@ -485,11 +527,21 @@ case "$1" in
# Wait for the node to completely stop...
PID="$(relx_get_pid)"
if ! relx_nodetool "stop"; then
echoerr "Graceful shutdown failed PID=[$PID]"
exit 1
fi
while kill -s 0 "$PID" 2>/dev/null; do
sleep 1
done
WAIT_TIME="${WAIT_FOR_ERLANG_STOP:-60}"
if ! wait_for "$WAIT_TIME" 'is_down' "$PID"; then
msg="dangling after ${WAIT_TIME} seconds"
# also log to syslog
logger -t "${REL_NAME}[${PID}]" "STOP: $msg"
# log to user console
echoerr "stop failed, $msg"
echo "ERROR: $PID is still around"
ps -p "$PID"
exit 1
fi
logger -t "${REL_NAME}[${PID}]" "STOP: OK"
;;
restart|reboot)