From 4eb9a69df02d3de60fce3bb05e3a6d4fd7865f14 Mon Sep 17 00:00:00 2001
From: "Zaiming (Stone) Shi" <zmstone@gmail.com>
Date: Wed, 1 Mar 2023 14:12:15 +0100
Subject: [PATCH] feat(bin/emqx): diagnose boot failure

Prior to this change, when EMQX daemon mode failed to start
it's not quite easy for users to understand what went wrong.
All the know is the node did not start in time
and then instructed to boot the node in 'console' mode wishing
for some logs.
However, the node might actuay be running, causing 'console' mode
to fail with a different reason.

With this change, after a filure of daemon mode boot,
we issue a diagnosis.
1. if node can not be found from ps -ef, instruct the user
   to find information in erlang.log.N
2. if the node is found running, but not responding to pings
   instruct the user to check if the node name is
   resolvable and reachable
3. if the node is responding to pings but emqx app is not
   running, then it's likely a bug. so the user is advised
   to report a github issue.
---
 apps/emqx_machine/src/emqx_machine.erl |   5 ++
 bin/emqx                               | 106 +++++++++++++++++--------
 changes/ce/fix-10052.en.md             |  12 +++
 changes/ce/fix-10052.zh.md             |  11 +++
 4 files changed, 101 insertions(+), 33 deletions(-)
 create mode 100644 changes/ce/fix-10052.en.md
 create mode 100644 changes/ce/fix-10052.zh.md

diff --git a/apps/emqx_machine/src/emqx_machine.erl b/apps/emqx_machine/src/emqx_machine.erl
index 9dc3fdc54..243c4bb8c 100644
--- a/apps/emqx_machine/src/emqx_machine.erl
+++ b/apps/emqx_machine/src/emqx_machine.erl
@@ -19,6 +19,7 @@
 -export([
     start/0,
     graceful_shutdown/0,
+    brutal_shutdown/0,
     is_ready/0,
 
     node_status/0,
@@ -47,6 +48,10 @@ start() ->
 graceful_shutdown() ->
     emqx_machine_terminator:graceful_wait().
 
+%% only used when failed to boot
+brutal_shutdown() ->
+    init:stop().
+
 set_backtrace_depth() ->
     {ok, Depth} = application:get_env(emqx_machine, backtrace_depth),
     _ = erlang:system_flag(backtrace_depth, Depth),
diff --git a/bin/emqx b/bin/emqx
index 8c5fdf45b..14f94f359 100755
--- a/bin/emqx
+++ b/bin/emqx
@@ -451,6 +451,20 @@ call_hocon() {
         || die "call_hocon_failed: $*" $?
 }
 
+find_emqx_process() {
+    ## Find the running node from 'ps -ef'
+    ##  * The grep args like '[e]mqx' but not 'emqx' is to avoid greping the grep command itself
+    ##  * The running 'remsh' and 'nodetool' processes must be excluded
+    if [ -n "${EMQX_NODE__NAME:-}" ]; then
+        # if node name is provided, filter by node name
+        # shellcheck disable=SC2009
+        ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -E "\s\-s?name\s${EMQX_NODE__NAME}" | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true
+    else
+        # shellcheck disable=SC2009
+        ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true
+    fi
+}
+
 ## Resolve boot configs in a batch
 ## This is because starting the Erlang beam with all modules loaded
 ## and parsing HOCON config + environment variables is a non-trivial task
@@ -468,17 +482,7 @@ fi
 # Turn off debug as the ps output can be quite noisy
 set +x
 
-## Find the running node from 'ps -ef'
-##  * The grep args like '[e]mqx' but not 'emqx' is to avoid greping the grep command itself
-##  * The running 'remsh' and 'nodetool' processes must be excluded
-if [ -n "${EMQX_NODE__NAME:-}" ]; then
-    # if node name is provided, filter by node name
-    # shellcheck disable=SC2009
-    PS_LINE="$(ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -E "\s\-s?name\s${EMQX_NODE__NAME}" | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true)"
-else
-    # shellcheck disable=SC2009
-    PS_LINE="$(ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true)"
-fi
+PS_LINE="$(find_emqx_process)"
 logdebug "PS_LINE=$PS_LINE"
 RUNNING_NODES_COUNT="$(echo -e "$PS_LINE" | sed '/^\s*$/d' | wc -l)"
 [ "$RUNNING_NODES_COUNT" -gt 1 ] && logdebug "More than one running node found: count=$RUNNING_NODES_COUNT"
@@ -927,6 +931,7 @@ case "$NAME" in
         NAME_TYPE='-sname'
 esac
 SHORT_NAME="$(echo "$NAME" | awk -F'@' '{print $1}')"
+HOST_NAME="$(echo "$NAME" | awk -F'@' '{print $2}')"
 if ! (echo "$SHORT_NAME" | grep -q '^[0-9A-Za-z_\-]\+$'); then
     logerr "Invalid node name, should be of format '^[0-9A-Za-z_-]+$'."
     exit 1
@@ -969,6 +974,59 @@ if [[ "$IS_BOOT_COMMAND" == 'yes' && "$(get_boot_config 'node.db_backend')" == "
     fi
 fi
 
+diagnose_boot_failure_and_die() {
+    local ps_line
+    ps_line="$(find_emqx_process)"
+    if [ -z "$ps_line" ]; then
+        echo "Find more information in the latest log file: ${RUNNER_LOG_DIR}/erlang.log.*"
+        exit 1
+    fi
+    if ! relx_nodetool "ping" > /dev/null; then
+        logerr "$NAME seems to be running, but not responding to pings."
+        echo "Make sure '$HOST_NAME' is a resolvable and reachable hostname."
+        pipe_shutdown
+        exit 2
+    fi
+    if ! relx_nodetool 'eval' 'true = emqx:is_running()' > /dev/null; then
+        logerr "$NAME node is started, but failed to complete the boot sequence in time."
+        echo "Please collect the logs in ${RUNNER_LOG_DIR} and report a bug to EMQX team at https://github.com/emqx/emqx/issues/new/choose"
+        pipe_shutdown
+        exit 3
+    fi
+}
+
+## Only works when started in daemon mode
+pipe_shutdown() {
+    if [ -d "$PIPE_DIR" ]; then
+        echo "Shutting down $NAME from to_erl pipe."
+        ## can not evaluate init:stop() or erlang:halt() because the shell is restricted
+        echo 'emqx_machine:brutal_shutdown().' | "$BINDIR/to_erl" "$PIPE_DIR"
+    fi
+}
+
+## Call nodetool to stop EMQX
+nodetool_shutdown() {
+    # Wait for the node to completely stop...
+    PID="$(relx_get_pid)"
+    if ! relx_nodetool "stop"; then
+        die "Graceful shutdown failed PID=[$PID]"
+    fi
+    WAIT_TIME="${EMQX_WAIT_FOR_STOP:-120}"
+    if ! wait_for "$WAIT_TIME" 'is_down' "$PID"; then
+        msg="dangling after ${WAIT_TIME} seconds"
+        # also log to syslog
+        logger -t "${REL_NAME}[${PID}]" "STOP: $msg"
+        # log to user console
+        set +x
+        logerr "Stop failed, $msg"
+        echo "ERROR: $PID is still around"
+        ps -p "$PID"
+        exit 1
+    fi
+    echo "ok"
+    logger -t "${REL_NAME}[${PID}]" "STOP: OK"
+}
+
 cd "$RUNNER_ROOT_DIR"
 
 case "${COMMAND}" in
@@ -1014,33 +1072,15 @@ case "${COMMAND}" in
             echo "$EMQX_DESCRIPTION $REL_VSN is started successfully!"
             exit 0
         else
-            echo "$EMQX_DESCRIPTION $REL_VSN failed to start in ${WAIT_TIME} seconds."
-            echo "Please find more information in erlang.log.N"
-            echo "Or run 'env DEBUG=1 $0 console' to have logs printed to console."
-            exit 1
+            logerr "${EMQX_DESCRIPTION} ${REL_VSN} using node name '${NAME}' failed ${WAIT_TIME} probes."
+            diagnose_boot_failure_and_die
         fi
         ;;
 
     stop)
-        # Wait for the node to completely stop...
-        PID="$(relx_get_pid)"
-        if ! relx_nodetool "stop"; then
-            die "Graceful shutdown failed PID=[$PID]"
+        if ! nodetool_shutdown; then
+            pipe_shutdown
         fi
-        WAIT_TIME="${EMQX_WAIT_FOR_STOP:-120}"
-        if ! wait_for "$WAIT_TIME" 'is_down' "$PID"; then
-            msg="dangling after ${WAIT_TIME} seconds"
-            # also log to syslog
-            logger -t "${REL_NAME}[${PID}]" "STOP: $msg"
-            # log to user console
-            set +x
-            logerr "Stop failed, $msg"
-            echo "ERROR: $PID is still around"
-            ps -p "$PID"
-            exit 1
-        fi
-        echo "ok"
-        logger -t "${REL_NAME}[${PID}]" "STOP: OK"
         ;;
 
     pid)
diff --git a/changes/ce/fix-10052.en.md b/changes/ce/fix-10052.en.md
new file mode 100644
index 000000000..f83c4d40c
--- /dev/null
+++ b/changes/ce/fix-10052.en.md
@@ -0,0 +1,12 @@
+Improve daemon mode startup failure logs.
+
+Before this change, it was difficult for users to understand the reason for EMQX 'start' command failed to boot the node.
+The only information they received was that the node did not start within the expected time frame,
+and they were instructed to boot the node with 'console' command in the hope of obtaining some logs.
+However, the node might actually be running, which could cause 'console' mode to fail for a different reason.
+
+With this new change, when daemon mode fails to boot, a diagnosis is issued. Here are the possible scenarios:
+
+* If the node cannot be found from `ps -ef`, the user is instructed to find information in log files `erlang.log.*`.
+* If the node is found to be running but not responding to pings, the user is advised to check if the host name is resolvable and reachable.
+* If the node is responding to pings, but the EMQX app is not running, it is likely a bug. In this case, the user is advised to report a Github issue.
diff --git a/changes/ce/fix-10052.zh.md b/changes/ce/fix-10052.zh.md
new file mode 100644
index 000000000..1c2eff342
--- /dev/null
+++ b/changes/ce/fix-10052.zh.md
@@ -0,0 +1,11 @@
+优化 EMQX daemon 模式启动启动失败的日志。
+
+在进行此更改之前，当 EMQX 用 `start` 命令启动失败时，用户很难理解出错的原因。
+所知道的仅仅是节点未能在预期时间内启动，然后被指示以 `console` 式引导节点以获取一些日志。
+然而，节点实际上可能正在运行，这可能会导致 `console` 模式因不同的原因而失败。
+
+此次修复后，启动脚本会发出诊断：
+
+* 如果无法从 `ps -ef` 中找到节点，则指示用户在 `erlang.log.*` 中查找信息。
+* 如果发现节点正在运行但不响应 ping，则建议用户检查节点主机名是否有效并可达。
+* 如果节点响应 ping 但 EMQX 应用程序未运行，则很可能是一个错误。在这种情况下，建议用户报告一个Github issue。