diff --git a/apps/emqx_machine/src/emqx_machine.erl b/apps/emqx_machine/src/emqx_machine.erl index 9dc3fdc54..243c4bb8c 100644 --- a/apps/emqx_machine/src/emqx_machine.erl +++ b/apps/emqx_machine/src/emqx_machine.erl @@ -19,6 +19,7 @@ -export([ start/0, graceful_shutdown/0, + brutal_shutdown/0, is_ready/0, node_status/0, @@ -47,6 +48,10 @@ start() -> graceful_shutdown() -> emqx_machine_terminator:graceful_wait(). +%% only used when failed to boot +brutal_shutdown() -> + init:stop(). + set_backtrace_depth() -> {ok, Depth} = application:get_env(emqx_machine, backtrace_depth), _ = erlang:system_flag(backtrace_depth, Depth), diff --git a/bin/emqx b/bin/emqx index 8c5fdf45b..14f94f359 100755 --- a/bin/emqx +++ b/bin/emqx @@ -451,6 +451,20 @@ call_hocon() { || die "call_hocon_failed: $*" $? } +find_emqx_process() { + ## Find the running node from 'ps -ef' + ## * The grep args like '[e]mqx' but not 'emqx' is to avoid greping the grep command itself + ## * The running 'remsh' and 'nodetool' processes must be excluded + if [ -n "${EMQX_NODE__NAME:-}" ]; then + # if node name is provided, filter by node name + # shellcheck disable=SC2009 + ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -E "\s\-s?name\s${EMQX_NODE__NAME}" | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true + else + # shellcheck disable=SC2009 + ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true + fi +} + ## Resolve boot configs in a batch ## This is because starting the Erlang beam with all modules loaded ## and parsing HOCON config + environment variables is a non-trivial task @@ -468,17 +482,7 @@ fi # Turn off debug as the ps output can be quite noisy set +x -## Find the running node from 'ps -ef' -## * The grep args like '[e]mqx' but not 'emqx' is to avoid greping the grep command itself -## * The running 'remsh' and 'nodetool' processes must be excluded -if [ -n "${EMQX_NODE__NAME:-}" ]; then - # if node name is provided, filter by node name - # shellcheck disable=SC2009 - PS_LINE="$(ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -E "\s\-s?name\s${EMQX_NODE__NAME}" | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true)" -else - # shellcheck disable=SC2009 - PS_LINE="$(ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true)" -fi +PS_LINE="$(find_emqx_process)" logdebug "PS_LINE=$PS_LINE" RUNNING_NODES_COUNT="$(echo -e "$PS_LINE" | sed '/^\s*$/d' | wc -l)" [ "$RUNNING_NODES_COUNT" -gt 1 ] && logdebug "More than one running node found: count=$RUNNING_NODES_COUNT" @@ -927,6 +931,7 @@ case "$NAME" in NAME_TYPE='-sname' esac SHORT_NAME="$(echo "$NAME" | awk -F'@' '{print $1}')" +HOST_NAME="$(echo "$NAME" | awk -F'@' '{print $2}')" if ! (echo "$SHORT_NAME" | grep -q '^[0-9A-Za-z_\-]\+$'); then logerr "Invalid node name, should be of format '^[0-9A-Za-z_-]+$'." exit 1 @@ -969,6 +974,59 @@ if [[ "$IS_BOOT_COMMAND" == 'yes' && "$(get_boot_config 'node.db_backend')" == " fi fi +diagnose_boot_failure_and_die() { + local ps_line + ps_line="$(find_emqx_process)" + if [ -z "$ps_line" ]; then + echo "Find more information in the latest log file: ${RUNNER_LOG_DIR}/erlang.log.*" + exit 1 + fi + if ! relx_nodetool "ping" > /dev/null; then + logerr "$NAME seems to be running, but not responding to pings." + echo "Make sure '$HOST_NAME' is a resolvable and reachable hostname." + pipe_shutdown + exit 2 + fi + if ! relx_nodetool 'eval' 'true = emqx:is_running()' > /dev/null; then + logerr "$NAME node is started, but failed to complete the boot sequence in time." + echo "Please collect the logs in ${RUNNER_LOG_DIR} and report a bug to EMQX team at https://github.com/emqx/emqx/issues/new/choose" + pipe_shutdown + exit 3 + fi +} + +## Only works when started in daemon mode +pipe_shutdown() { + if [ -d "$PIPE_DIR" ]; then + echo "Shutting down $NAME from to_erl pipe." + ## can not evaluate init:stop() or erlang:halt() because the shell is restricted + echo 'emqx_machine:brutal_shutdown().' | "$BINDIR/to_erl" "$PIPE_DIR" + fi +} + +## Call nodetool to stop EMQX +nodetool_shutdown() { + # Wait for the node to completely stop... + PID="$(relx_get_pid)" + if ! relx_nodetool "stop"; then + die "Graceful shutdown failed PID=[$PID]" + fi + WAIT_TIME="${EMQX_WAIT_FOR_STOP:-120}" + if ! wait_for "$WAIT_TIME" 'is_down' "$PID"; then + msg="dangling after ${WAIT_TIME} seconds" + # also log to syslog + logger -t "${REL_NAME}[${PID}]" "STOP: $msg" + # log to user console + set +x + logerr "Stop failed, $msg" + echo "ERROR: $PID is still around" + ps -p "$PID" + exit 1 + fi + echo "ok" + logger -t "${REL_NAME}[${PID}]" "STOP: OK" +} + cd "$RUNNER_ROOT_DIR" case "${COMMAND}" in @@ -1014,33 +1072,15 @@ case "${COMMAND}" in echo "$EMQX_DESCRIPTION $REL_VSN is started successfully!" exit 0 else - echo "$EMQX_DESCRIPTION $REL_VSN failed to start in ${WAIT_TIME} seconds." - echo "Please find more information in erlang.log.N" - echo "Or run 'env DEBUG=1 $0 console' to have logs printed to console." - exit 1 + logerr "${EMQX_DESCRIPTION} ${REL_VSN} using node name '${NAME}' failed ${WAIT_TIME} probes." + diagnose_boot_failure_and_die fi ;; stop) - # Wait for the node to completely stop... - PID="$(relx_get_pid)" - if ! relx_nodetool "stop"; then - die "Graceful shutdown failed PID=[$PID]" + if ! nodetool_shutdown; then + pipe_shutdown fi - WAIT_TIME="${EMQX_WAIT_FOR_STOP:-120}" - if ! wait_for "$WAIT_TIME" 'is_down' "$PID"; then - msg="dangling after ${WAIT_TIME} seconds" - # also log to syslog - logger -t "${REL_NAME}[${PID}]" "STOP: $msg" - # log to user console - set +x - logerr "Stop failed, $msg" - echo "ERROR: $PID is still around" - ps -p "$PID" - exit 1 - fi - echo "ok" - logger -t "${REL_NAME}[${PID}]" "STOP: OK" ;; pid) diff --git a/changes/ce/fix-10052.en.md b/changes/ce/fix-10052.en.md new file mode 100644 index 000000000..f83c4d40c --- /dev/null +++ b/changes/ce/fix-10052.en.md @@ -0,0 +1,12 @@ +Improve daemon mode startup failure logs. + +Before this change, it was difficult for users to understand the reason for EMQX 'start' command failed to boot the node. +The only information they received was that the node did not start within the expected time frame, +and they were instructed to boot the node with 'console' command in the hope of obtaining some logs. +However, the node might actually be running, which could cause 'console' mode to fail for a different reason. + +With this new change, when daemon mode fails to boot, a diagnosis is issued. Here are the possible scenarios: + +* If the node cannot be found from `ps -ef`, the user is instructed to find information in log files `erlang.log.*`. +* If the node is found to be running but not responding to pings, the user is advised to check if the host name is resolvable and reachable. +* If the node is responding to pings, but the EMQX app is not running, it is likely a bug. In this case, the user is advised to report a Github issue. diff --git a/changes/ce/fix-10052.zh.md b/changes/ce/fix-10052.zh.md new file mode 100644 index 000000000..1c2eff342 --- /dev/null +++ b/changes/ce/fix-10052.zh.md @@ -0,0 +1,11 @@ +优化 EMQX daemon 模式启动启动失败的日志。 + +在进行此更改之前,当 EMQX 用 `start` 命令启动失败时,用户很难理解出错的原因。 +所知道的仅仅是节点未能在预期时间内启动,然后被指示以 `console` 式引导节点以获取一些日志。 +然而,节点实际上可能正在运行,这可能会导致 `console` 模式因不同的原因而失败。 + +此次修复后,启动脚本会发出诊断: + +* 如果无法从 `ps -ef` 中找到节点,则指示用户在 `erlang.log.*` 中查找信息。 +* 如果发现节点正在运行但不响应 ping,则建议用户检查节点主机名是否有效并可达。 +* 如果节点响应 ping 但 EMQX 应用程序未运行,则很可能是一个错误。在这种情况下,建议用户报告一个Github issue。