From 562ce1a81b9664eeada471f303ba5f83c9470409 Mon Sep 17 00:00:00 2001 From: Zaiming Shi Date: Wed, 4 Aug 2021 17:38:01 +0200 Subject: [PATCH 1/5] feat: bin/emqx pass down DEBUG if su exec --- bin/emqx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/emqx b/bin/emqx index 0afa81bc5..af2a7f433 100755 --- a/bin/emqx +++ b/bin/emqx @@ -5,7 +5,8 @@ set -e set -o pipefail -if [ -n "$DEBUG" ]; then +DEBUG="${DEBUG:-0}" +if [ "$DEBUG" -eq 1 ]; then set -x fi @@ -113,7 +114,7 @@ check_user() { echo "You need to be root or use sudo to run this command" exit 1 fi - CMD="\"$RUNNER_SCRIPT\" " + CMD="DEBUG=$DEBUG \"$RUNNER_SCRIPT\" " for ARG in "$@"; do CMD="${CMD} \"$ARG\"" done From c473f9990a7868568cfd4ae13d7e262b7e3bbd0d Mon Sep 17 00:00:00 2001 From: Zaiming Shi Date: Wed, 4 Aug 2021 16:22:59 +0200 Subject: [PATCH 2/5] test: run package test uder emqx user no need to su before start --- .ci/build_packages/tests.sh | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/.ci/build_packages/tests.sh b/.ci/build_packages/tests.sh index b5ba48f2c..211c5b838 100755 --- a/.ci/build_packages/tests.sh +++ b/.ci/build_packages/tests.sh @@ -73,7 +73,7 @@ emqx_test(){ fi echo "running ${packagename} start" - running_test + run_test echo "running ${packagename} stop" dpkg -r "${EMQX_NAME}" @@ -99,7 +99,7 @@ emqx_test(){ fi echo "running ${packagename} start" - running_test + run_test echo "running ${packagename} stop" rpm -e "${EMQX_NAME}" @@ -113,7 +113,7 @@ emqx_test(){ done } -running_test(){ +run_test(){ # sed -i '/emqx_telemetry/d' /var/lib/emqx/loaded_plugins emqx_env_vars=$(dirname "$(readlink "$(command -v emqx)")")/../releases/emqx_vars @@ -132,7 +132,7 @@ EOF exit 1 fi - if ! su - emqx -c "emqx start"; then + if ! emqx 'start'; then cat /var/log/emqx/erlang.log.1 || true cat /var/log/emqx/emqx.log.1 || true exit 1 @@ -148,8 +148,15 @@ EOF IDLE_TIME=$((IDLE_TIME+1)) done pytest -v /paho-mqtt-testing/interoperability/test_client/V5/test_connect.py::test_basic + export DEBUG=1 # shellcheck disable=SC2009 # pgrep does not support Extended Regular Expressions - emqx stop || kill "$(ps -ef | grep -E '\-progname\s.+emqx\s' |awk '{print $2}')" + ps -ef | grep -E '\-progname\s.+emqx\s' + if ! emqx 'stop'; then + echo "ERROR: failed_to_stop_emqx_with_the_stop_command" + cat /var/log/emqx/erlang.log.1 || true + cat /var/log/emqx/emqx.log.1 || true + exit 1 + fi if [ "$(sed -n '/^ID=/p' /etc/os-release | sed -r 's/ID=(.*)/\1/g' | sed 's/"//g')" = ubuntu ] \ || [ "$(sed -n '/^ID=/p' /etc/os-release | sed -r 's/ID=(.*)/\1/g' | sed 's/"//g')" = debian ] ;then From 9f3063a823cf0867c5cd27e9af2f8043ec3d2ce6 Mon Sep 17 00:00:00 2001 From: Zaiming Shi Date: Wed, 4 Aug 2021 21:33:42 +0200 Subject: [PATCH 3/5] refactor: bin/emqx call fucntions for error logs --- .ci/build_packages/tests.sh | 1 - bin/emqx | 87 ++++++++++++------------------------- 2 files changed, 28 insertions(+), 60 deletions(-) diff --git a/.ci/build_packages/tests.sh b/.ci/build_packages/tests.sh index 211c5b838..3b08f838a 100755 --- a/.ci/build_packages/tests.sh +++ b/.ci/build_packages/tests.sh @@ -148,7 +148,6 @@ EOF IDLE_TIME=$((IDLE_TIME+1)) done pytest -v /paho-mqtt-testing/interoperability/test_client/V5/test_connect.py::test_basic - export DEBUG=1 # shellcheck disable=SC2009 # pgrep does not support Extended Regular Expressions ps -ef | grep -E '\-progname\s.+emqx\s' if ! emqx 'stop'; then diff --git a/bin/emqx b/bin/emqx index af2a7f433..436b94b18 100755 --- a/bin/emqx +++ b/bin/emqx @@ -43,12 +43,21 @@ export LD_LIBRARY_PATH="$ERTS_DIR/lib:$LD_LIBRARY_PATH" export ERTS_LIB_DIR="$ERTS_DIR/../lib" MNESIA_DATA_DIR="$RUNNER_DATA_DIR/mnesia/$NAME" +# Echo to stderr on errors +echoerr() { echo "ERROR: $*" 1>&2; } + die() { - echo >&2 "$1" + echoerr "ERROR: $1" errno=${2:-1} exit "$errno" } +assert_node_alive() { + if ! relx_nodetool "ping" > /dev/null; then + die "node_is_not_running!" 1 + fi +} + relx_usage() { command="$1" @@ -146,9 +155,6 @@ if [ "$ULIMIT_F" -lt 1024 ]; then echo "!!!!" fi -# Echo to stderr on errors -echoerr() { echo "$@" 1>&2; } - SED_REPLACE="sed -i " case $(sed --help 2>&1) in *GNU*) SED_REPLACE="sed -i ";; @@ -205,7 +211,7 @@ call_hocon() { export RUNNER_ETC_DIR export REL_VSN "$ERTS_DIR/bin/escript" "$ROOTDIR/bin/nodetool" hocon "$@" \ - || die "ERROR: call_hocon failed: $*" $? + || die "call_hocon_failed: $*" $? } # Run an escript in the node's environment @@ -282,7 +288,7 @@ generate_config() { # shellcheck disable=SC2086 if ! relx_nodetool chkconfig $CONFIG_ARGS; then - die "Error reading $CONFIG_ARGS" + die "failed_to_check_config $CONFIG_ARGS" fi } @@ -327,7 +333,7 @@ if [ -z "$NAME" ]; then # shellcheck disable=SC2012,SC2086 LATEST_VM_ARGS="$(ls -t $CONFIGS_DIR/vm.*.args | head -1)" if [ -z "$LATEST_VM_ARGS" ]; then - echoerr "For command $1, there is no vm.*.args file found in $CONFIGS_DIR/" + echoerr "no_vm_arg_file_found_for $1 in $CONFIGS_DIR/" exit 1 fi NAME="$(grep -E '^-s?name' "$LATEST_VM_ARGS" | awk '{print $2}')" @@ -360,7 +366,7 @@ if [ -z "$COOKIE" ]; then # shellcheck disable=SC2012,SC2086 LATEST_VM_ARGS="$(ls -t $CONFIGS_DIR/vm.*.args | head -1)" if [ -z "$LATEST_VM_ARGS" ]; then - echo "For command $1, there is no vm.*.args config file found in $CONFIGS_DIR/" + echoerr "no_vm_arg_file_found_for $1 in $CONFIGS_DIR/" exit 1 fi COOKIE="$(grep -E '^-setcookie' "$LATEST_VM_ARGS" | awk '{print $2}')" @@ -385,8 +391,7 @@ case "$1" in start|start_boot) # Make sure a node IS not running if relx_nodetool "ping" >/dev/null 2>&1; then - echo "Node is already running!" - exit 1 + die "node_is_already_running!" fi # Bootstrap daemon command (check perms & drop to $RUNNER_USER) bootstrapd @@ -454,11 +459,14 @@ case "$1" in # Wait for the node to completely stop... PID="$(relx_get_pid)" if ! relx_nodetool "stop"; then + echoerr "emqx_graceful_shutdown_failed PID=[$PID]" exit 1 fi while kill -s 0 "$PID" 2>/dev/null; do sleep 1 done + echoerr "emqx_pid_dangling_after ${max_wait} seconds PID=[$PID]" + exit 1 ;; restart|reboot) @@ -474,10 +482,7 @@ case "$1" in ;; ping) - ## See if the VM is alive - if ! relx_nodetool "ping"; then - exit 1 - fi + assert_node_alive ;; escript) @@ -488,11 +493,7 @@ case "$1" in ;; attach) - # Make sure a node IS running - if ! relx_nodetool "ping" > /dev/null; then - echo "Node is not running!" - exit 1 - fi + assert_node_alive # Bootstrap daemon command (check perms & drop to $RUNNER_USER) bootstrapd @@ -502,11 +503,7 @@ case "$1" in ;; remote_console) - # Make sure a node IS running - if ! relx_nodetool "ping" > /dev/null; then - echo "Node is not running!" - exit 1 - fi + assert_node_alive # Bootstrap daemon command (check perms & drop to $RUNNER_USER) bootstrapd @@ -524,11 +521,7 @@ case "$1" in COMMAND="$1"; shift - # Make sure a node IS running - if ! relx_nodetool "ping" > /dev/null; then - echo "Node is not running!" - exit 1 - fi + assert_node_alive ERL_FLAGS="$ERL_FLAGS $EPMD_ARG" \ exec "$BINDIR/escript" "$ROOTDIR/bin/install_upgrade.escript" \ @@ -536,11 +529,7 @@ case "$1" in ;; versions) - # Make sure a node IS running - if ! relx_nodetool "ping" > /dev/null; then - echo "Node is not running!" - exit 1 - fi + assert_node_alive COMMAND="$1"; shift @@ -652,54 +641,34 @@ case "$1" in echo "$ERTS_PATH" ;; ctl) - # Make sure a node IS running - if ! relx_nodetool "ping" > /dev/null; then - echo "Node is not running!" - exit 1 - fi + assert_node_alive shift relx_nodetool rpc emqx_ctl run_command "$@" ;; rpc) - # Make sure a node IS running - if ! relx_nodetool "ping" > /dev/null; then - echo "Node is not running!" - exit 1 - fi + assert_node_alive shift relx_nodetool rpc "$@" ;; rpcterms) - # Make sure a node IS running - if ! relx_nodetool "ping" > /dev/null; then - echo "Node is not running!" - exit 1 - fi + assert_node_alive shift relx_nodetool rpcterms "$@" ;; root_dir) - # Make sure a node IS running - if ! relx_nodetool "ping" > /dev/null; then - echo "Node is not running!" - exit 1 - fi + assert_node_alive shift relx_nodetool "eval" 'code:root_dir()' ;; eval) - # Make sure a node IS running - if ! relx_nodetool "ping" > /dev/null; then - echo "Node is not running!" - exit 1 - fi + assert_node_alive shift relx_nodetool "eval" "$@" From 668ecbe97bf8554a3164f59101228edbb06b8d6c Mon Sep 17 00:00:00 2001 From: Zaiming Shi Date: Wed, 4 Aug 2021 13:05:50 +0200 Subject: [PATCH 4/5] fix: improve node stop wait loop --- .../src/emqx_machine_terminator.erl | 21 ++----- bin/emqx | 63 ++++++++++++------- 2 files changed, 48 insertions(+), 36 deletions(-) diff --git a/apps/emqx_machine/src/emqx_machine_terminator.erl b/apps/emqx_machine/src/emqx_machine_terminator.erl index 524cf316d..a1f66deb6 100644 --- a/apps/emqx_machine/src/emqx_machine_terminator.erl +++ b/apps/emqx_machine/src/emqx_machine_terminator.erl @@ -41,7 +41,7 @@ start_link() -> is_running() -> is_pid(whereis(?TERMINATOR)). -%% @doc Send a signal to activate the terminator. +%% @doc Call terminator to stop applications, then issue init:stop() to terminat the VM. graceful() -> try _ = gen_server:call(?TERMINATOR, ?DO_IT, infinity) @@ -52,28 +52,19 @@ graceful() -> %% should issue a shutdown to be sure %% NOTE: not exit_loop here because we do not want to %% block erl_signal_server + ?ELOG("Shutdown before node is ready?~n", []), init:stop() end, ok. -%% @doc Shutdown the Erlang VM and wait until the terminator dies or the VM dies. +%% @doc Shutdown the Erlang VM and wait indefinitely. graceful_wait() -> - case whereis(?TERMINATOR) of - undefined -> - ?SLOG(warning, #{msg => "shutdown_before_boot_is_complete"}), - exit_loop(); - Pid -> - ok = graceful(), - Ref = monitor(process, Pid), - %% NOTE: not exactly sure, but maybe there is a chance that - %% Erlang VM goes down before this receive. - %% In which case, the remote caller will get {badrpc, nodedown} - receive {'DOWN', Ref, process, Pid, _} -> ok end - end. + ok = graceful(), + exit_loop(). exit_loop() -> - init:stop(), timer:sleep(100), + init:stop(), exit_loop(). init(_) -> diff --git a/bin/emqx b/bin/emqx index 436b94b18..c582f8fbf 100755 --- a/bin/emqx +++ b/bin/emqx @@ -299,6 +299,33 @@ bootstrapd() { fi } +# check if a PID is down +is_down() { + PID="$1" + if kill -s 0 "$PID" 2>/dev/null; then + return 1 + fi + return 0 +} + +wait_for() { + local WAIT_TIME + local CMD + WAIT_TIME="$1" + shift + CMD="$*" + while true; do + if $CMD >/dev/null 2>&1; then + return 0 + fi + if [ "$WAIT_TIME" -le 0 ]; then + return 1 + fi + WAIT_TIME=$((WAIT_TIME - 1)) + sleep 1 + done +} + # Use $CWD/etc/sys.config if exists if [ -z "$RELX_CONFIG_PATH" ]; then if [ -f "$RUNNER_ETC_DIR/sys.config" ]; then @@ -437,22 +464,16 @@ case "$1" in "$(relx_start_command)" WAIT_TIME=${WAIT_FOR_ERLANG:-15} - while [ "$WAIT_TIME" -gt 0 ]; do - if ! relx_nodetool "ping" >/dev/null 2>&1; then - WAIT_TIME=$((WAIT_TIME - 1)) - sleep 1 - continue - fi - sleep 1 - if relx_nodetool "ping" >/dev/null 2>&1; then - echo "$EMQX_DESCRIPTION $REL_VSN is started successfully!" - exit 0 - fi - done && echo "$EMQX_DESCRIPTION $REL_VSN failed to start within ${WAIT_FOR_ERLANG:-15} seconds," - echo "see the output of '$0 console' for more information." - echo "If you want to wait longer, set the environment variable" - echo "WAIT_FOR_ERLANG to the number of seconds to wait." - exit 1 + if wait_for "$WAIT_TIME" 'relx_nodetool' 'ping'; then + echo "$EMQX_DESCRIPTION $REL_VSN is started successfully!" + exit 0 + else + echo "$EMQX_DESCRIPTION $REL_VSN failed to start within ${WAIT_TIME} seconds," + echo "see the output of '$0 console' for more information." + echo "If you want to wait longer, set the environment variable" + echo "WAIT_FOR_ERLANG to the number of seconds to wait." + exit 1 + fi ;; stop) @@ -462,11 +483,11 @@ case "$1" in echoerr "emqx_graceful_shutdown_failed PID=[$PID]" exit 1 fi - while kill -s 0 "$PID" 2>/dev/null; do - sleep 1 - done - echoerr "emqx_pid_dangling_after ${max_wait} seconds PID=[$PID]" - exit 1 + WAIT_TIME=30 + if ! wait_for "$WAIT_TIME" is_down "$PID"; then + echoerr "emqx_pid_dangling_after ${WAIT_TIME} seconds PID=[$PID]" + exit 1 + fi ;; restart|reboot) From f1262e04bfadd17888b6854312f97139915606ed Mon Sep 17 00:00:00 2001 From: Zaiming Shi Date: Sun, 8 Aug 2021 11:23:01 +0200 Subject: [PATCH 5/5] feat: log error if timed out waiting after init:stop --- apps/emqx_machine/src/emqx_machine_terminator.erl | 3 ++- bin/emqx | 15 ++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/apps/emqx_machine/src/emqx_machine_terminator.erl b/apps/emqx_machine/src/emqx_machine_terminator.erl index a1f66deb6..74479a6d9 100644 --- a/apps/emqx_machine/src/emqx_machine_terminator.erl +++ b/apps/emqx_machine/src/emqx_machine_terminator.erl @@ -41,7 +41,8 @@ start_link() -> is_running() -> is_pid(whereis(?TERMINATOR)). -%% @doc Call terminator to stop applications, then issue init:stop() to terminat the VM. +%% @doc Call `emqx_machine_terminator' to stop applications +%% then call init:stop() stop beam. graceful() -> try _ = gen_server:call(?TERMINATOR, ?DO_IT, infinity) diff --git a/bin/emqx b/bin/emqx index c582f8fbf..6b6a6504c 100755 --- a/bin/emqx +++ b/bin/emqx @@ -480,14 +480,19 @@ case "$1" in # Wait for the node to completely stop... PID="$(relx_get_pid)" if ! relx_nodetool "stop"; then - echoerr "emqx_graceful_shutdown_failed PID=[$PID]" + echoerr "Graceful shutdown failed PID=[$PID]" exit 1 fi - WAIT_TIME=30 + WAIT_TIME="${WAIT_FOR_ERLANG_STOP:-60}" if ! wait_for "$WAIT_TIME" is_down "$PID"; then - echoerr "emqx_pid_dangling_after ${WAIT_TIME} seconds PID=[$PID]" + msg="dangling after ${WAIT_TIME} seconds" + # also log to syslog + logger -t "${REL_NAME}[${PID}]" "STOP: $msg" + # log to user console + echoerr "STOP: $msg" exit 1 fi + logger -t "${REL_NAME}[${PID}]" "STOP: OK" ;; restart|reboot) @@ -612,7 +617,7 @@ case "$1" in $RELX_CONFIG_PATH $CONFIG_ARGS $EPMD_ARG # Log the startup - logger -t "${REL_NAME}[$$]" "$* -- ${1+$ARGS}" + logger -t "${REL_NAME}[$$]" "EXEC: $* -- ${1+$ARGS}" # Start the VM exec "$@" -- ${1+$ARGS} @@ -653,7 +658,7 @@ case "$1" in $RELX_CONFIG_PATH $CONFIG_ARGS $EPMD_ARG # Log the startup - logger -t "${REL_NAME}[$$]" "$* -- ${1+$ARGS}" + logger -t "${REL_NAME}[$$]" "EXEC: $* -- ${1+$ARGS}" # Start the VM exec "$@" -- ${1+$ARGS}