feat(bin/emqx): diagnose boot failure

Prior to this change, when EMQX daemon mode failed to start
it's not quite easy for users to understand what went wrong.
All the know is the node did not start in time
and then instructed to boot the node in 'console' mode wishing
for some logs.
However, the node might actuay be running, causing 'console' mode
to fail with a different reason.

With this change, after a filure of daemon mode boot,
we issue a diagnosis.
1. if node can not be found from ps -ef, instruct the user
   to find information in erlang.log.N
2. if the node is found running, but not responding to pings
   instruct the user to check if the node name is
   resolvable and reachable
3. if the node is responding to pings but emqx app is not
   running, then it's likely a bug. so the user is advised
   to report a github issue.
This commit is contained in:
Zaiming (Stone) Shi 2023-03-01 14:12:15 +01:00
parent c4c0c9cc7e
commit 4eb9a69df0
4 changed files with 101 additions and 33 deletions

View File

@ -19,6 +19,7 @@
-export([
start/0,
graceful_shutdown/0,
brutal_shutdown/0,
is_ready/0,
node_status/0,
@ -47,6 +48,10 @@ start() ->
graceful_shutdown() ->
emqx_machine_terminator:graceful_wait().
%% only used when failed to boot
brutal_shutdown() ->
init:stop().
set_backtrace_depth() ->
{ok, Depth} = application:get_env(emqx_machine, backtrace_depth),
_ = erlang:system_flag(backtrace_depth, Depth),

106
bin/emqx
View File

@ -451,6 +451,20 @@ call_hocon() {
|| die "call_hocon_failed: $*" $?
}
find_emqx_process() {
## Find the running node from 'ps -ef'
## * The grep args like '[e]mqx' but not 'emqx' is to avoid greping the grep command itself
## * The running 'remsh' and 'nodetool' processes must be excluded
if [ -n "${EMQX_NODE__NAME:-}" ]; then
# if node name is provided, filter by node name
# shellcheck disable=SC2009
ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -E "\s\-s?name\s${EMQX_NODE__NAME}" | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true
else
# shellcheck disable=SC2009
ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true
fi
}
## Resolve boot configs in a batch
## This is because starting the Erlang beam with all modules loaded
## and parsing HOCON config + environment variables is a non-trivial task
@ -468,17 +482,7 @@ fi
# Turn off debug as the ps output can be quite noisy
set +x
## Find the running node from 'ps -ef'
## * The grep args like '[e]mqx' but not 'emqx' is to avoid greping the grep command itself
## * The running 'remsh' and 'nodetool' processes must be excluded
if [ -n "${EMQX_NODE__NAME:-}" ]; then
# if node name is provided, filter by node name
# shellcheck disable=SC2009
PS_LINE="$(ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -E "\s\-s?name\s${EMQX_NODE__NAME}" | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true)"
else
# shellcheck disable=SC2009
PS_LINE="$(ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true)"
fi
PS_LINE="$(find_emqx_process)"
logdebug "PS_LINE=$PS_LINE"
RUNNING_NODES_COUNT="$(echo -e "$PS_LINE" | sed '/^\s*$/d' | wc -l)"
[ "$RUNNING_NODES_COUNT" -gt 1 ] && logdebug "More than one running node found: count=$RUNNING_NODES_COUNT"
@ -927,6 +931,7 @@ case "$NAME" in
NAME_TYPE='-sname'
esac
SHORT_NAME="$(echo "$NAME" | awk -F'@' '{print $1}')"
HOST_NAME="$(echo "$NAME" | awk -F'@' '{print $2}')"
if ! (echo "$SHORT_NAME" | grep -q '^[0-9A-Za-z_\-]\+$'); then
logerr "Invalid node name, should be of format '^[0-9A-Za-z_-]+$'."
exit 1
@ -969,6 +974,59 @@ if [[ "$IS_BOOT_COMMAND" == 'yes' && "$(get_boot_config 'node.db_backend')" == "
fi
fi
diagnose_boot_failure_and_die() {
local ps_line
ps_line="$(find_emqx_process)"
if [ -z "$ps_line" ]; then
echo "Find more information in the latest log file: ${RUNNER_LOG_DIR}/erlang.log.*"
exit 1
fi
if ! relx_nodetool "ping" > /dev/null; then
logerr "$NAME seems to be running, but not responding to pings."
echo "Make sure '$HOST_NAME' is a resolvable and reachable hostname."
pipe_shutdown
exit 2
fi
if ! relx_nodetool 'eval' 'true = emqx:is_running()' > /dev/null; then
logerr "$NAME node is started, but failed to complete the boot sequence in time."
echo "Please collect the logs in ${RUNNER_LOG_DIR} and report a bug to EMQX team at https://github.com/emqx/emqx/issues/new/choose"
pipe_shutdown
exit 3
fi
}
## Only works when started in daemon mode
pipe_shutdown() {
if [ -d "$PIPE_DIR" ]; then
echo "Shutting down $NAME from to_erl pipe."
## can not evaluate init:stop() or erlang:halt() because the shell is restricted
echo 'emqx_machine:brutal_shutdown().' | "$BINDIR/to_erl" "$PIPE_DIR"
fi
}
## Call nodetool to stop EMQX
nodetool_shutdown() {
# Wait for the node to completely stop...
PID="$(relx_get_pid)"
if ! relx_nodetool "stop"; then
die "Graceful shutdown failed PID=[$PID]"
fi
WAIT_TIME="${EMQX_WAIT_FOR_STOP:-120}"
if ! wait_for "$WAIT_TIME" 'is_down' "$PID"; then
msg="dangling after ${WAIT_TIME} seconds"
# also log to syslog
logger -t "${REL_NAME}[${PID}]" "STOP: $msg"
# log to user console
set +x
logerr "Stop failed, $msg"
echo "ERROR: $PID is still around"
ps -p "$PID"
exit 1
fi
echo "ok"
logger -t "${REL_NAME}[${PID}]" "STOP: OK"
}
cd "$RUNNER_ROOT_DIR"
case "${COMMAND}" in
@ -1014,33 +1072,15 @@ case "${COMMAND}" in
echo "$EMQX_DESCRIPTION $REL_VSN is started successfully!"
exit 0
else
echo "$EMQX_DESCRIPTION $REL_VSN failed to start in ${WAIT_TIME} seconds."
echo "Please find more information in erlang.log.N"
echo "Or run 'env DEBUG=1 $0 console' to have logs printed to console."
exit 1
logerr "${EMQX_DESCRIPTION} ${REL_VSN} using node name '${NAME}' failed ${WAIT_TIME} probes."
diagnose_boot_failure_and_die
fi
;;
stop)
# Wait for the node to completely stop...
PID="$(relx_get_pid)"
if ! relx_nodetool "stop"; then
die "Graceful shutdown failed PID=[$PID]"
if ! nodetool_shutdown; then
pipe_shutdown
fi
WAIT_TIME="${EMQX_WAIT_FOR_STOP:-120}"
if ! wait_for "$WAIT_TIME" 'is_down' "$PID"; then
msg="dangling after ${WAIT_TIME} seconds"
# also log to syslog
logger -t "${REL_NAME}[${PID}]" "STOP: $msg"
# log to user console
set +x
logerr "Stop failed, $msg"
echo "ERROR: $PID is still around"
ps -p "$PID"
exit 1
fi
echo "ok"
logger -t "${REL_NAME}[${PID}]" "STOP: OK"
;;
pid)

View File

@ -0,0 +1,12 @@
Improve daemon mode startup failure logs.
Before this change, it was difficult for users to understand the reason for EMQX 'start' command failed to boot the node.
The only information they received was that the node did not start within the expected time frame,
and they were instructed to boot the node with 'console' command in the hope of obtaining some logs.
However, the node might actually be running, which could cause 'console' mode to fail for a different reason.
With this new change, when daemon mode fails to boot, a diagnosis is issued. Here are the possible scenarios:
* If the node cannot be found from `ps -ef`, the user is instructed to find information in log files `erlang.log.*`.
* If the node is found to be running but not responding to pings, the user is advised to check if the host name is resolvable and reachable.
* If the node is responding to pings, but the EMQX app is not running, it is likely a bug. In this case, the user is advised to report a Github issue.

View File

@ -0,0 +1,11 @@
优化 EMQX daemon 模式启动启动失败的日志。
在进行此更改之前,当 EMQX 用 `start` 命令启动失败时,用户很难理解出错的原因。
所知道的仅仅是节点未能在预期时间内启动,然后被指示以 `console` 式引导节点以获取一些日志。
然而,节点实际上可能正在运行,这可能会导致 `console` 模式因不同的原因而失败。
此次修复后,启动脚本会发出诊断:
* 如果无法从 `ps -ef` 中找到节点,则指示用户在 `erlang.log.*` 中查找信息。
* 如果发现节点正在运行但不响应 ping则建议用户检查节点主机名是否有效并可达。
* 如果节点响应 ping 但 EMQX 应用程序未运行则很可能是一个错误。在这种情况下建议用户报告一个Github issue。