diff --git a/.ci/build_packages/tests.sh b/.ci/build_packages/tests.sh index b5ba48f2c..3b08f838a 100755 --- a/.ci/build_packages/tests.sh +++ b/.ci/build_packages/tests.sh @@ -73,7 +73,7 @@ emqx_test(){ fi echo "running ${packagename} start" - running_test + run_test echo "running ${packagename} stop" dpkg -r "${EMQX_NAME}" @@ -99,7 +99,7 @@ emqx_test(){ fi echo "running ${packagename} start" - running_test + run_test echo "running ${packagename} stop" rpm -e "${EMQX_NAME}" @@ -113,7 +113,7 @@ emqx_test(){ done } -running_test(){ +run_test(){ # sed -i '/emqx_telemetry/d' /var/lib/emqx/loaded_plugins emqx_env_vars=$(dirname "$(readlink "$(command -v emqx)")")/../releases/emqx_vars @@ -132,7 +132,7 @@ EOF exit 1 fi - if ! su - emqx -c "emqx start"; then + if ! emqx 'start'; then cat /var/log/emqx/erlang.log.1 || true cat /var/log/emqx/emqx.log.1 || true exit 1 @@ -149,7 +149,13 @@ EOF done pytest -v /paho-mqtt-testing/interoperability/test_client/V5/test_connect.py::test_basic # shellcheck disable=SC2009 # pgrep does not support Extended Regular Expressions - emqx stop || kill "$(ps -ef | grep -E '\-progname\s.+emqx\s' |awk '{print $2}')" + ps -ef | grep -E '\-progname\s.+emqx\s' + if ! emqx 'stop'; then + echo "ERROR: failed_to_stop_emqx_with_the_stop_command" + cat /var/log/emqx/erlang.log.1 || true + cat /var/log/emqx/emqx.log.1 || true + exit 1 + fi if [ "$(sed -n '/^ID=/p' /etc/os-release | sed -r 's/ID=(.*)/\1/g' | sed 's/"//g')" = ubuntu ] \ || [ "$(sed -n '/^ID=/p' /etc/os-release | sed -r 's/ID=(.*)/\1/g' | sed 's/"//g')" = debian ] ;then diff --git a/apps/emqx_machine/src/emqx_machine_terminator.erl b/apps/emqx_machine/src/emqx_machine_terminator.erl index 524cf316d..74479a6d9 100644 --- a/apps/emqx_machine/src/emqx_machine_terminator.erl +++ b/apps/emqx_machine/src/emqx_machine_terminator.erl @@ -41,7 +41,8 @@ start_link() -> is_running() -> is_pid(whereis(?TERMINATOR)). -%% @doc Send a signal to activate the terminator. +%% @doc Call `emqx_machine_terminator' to stop applications +%% then call init:stop() stop beam. graceful() -> try _ = gen_server:call(?TERMINATOR, ?DO_IT, infinity) @@ -52,28 +53,19 @@ graceful() -> %% should issue a shutdown to be sure %% NOTE: not exit_loop here because we do not want to %% block erl_signal_server + ?ELOG("Shutdown before node is ready?~n", []), init:stop() end, ok. -%% @doc Shutdown the Erlang VM and wait until the terminator dies or the VM dies. +%% @doc Shutdown the Erlang VM and wait indefinitely. graceful_wait() -> - case whereis(?TERMINATOR) of - undefined -> - ?SLOG(warning, #{msg => "shutdown_before_boot_is_complete"}), - exit_loop(); - Pid -> - ok = graceful(), - Ref = monitor(process, Pid), - %% NOTE: not exactly sure, but maybe there is a chance that - %% Erlang VM goes down before this receive. - %% In which case, the remote caller will get {badrpc, nodedown} - receive {'DOWN', Ref, process, Pid, _} -> ok end - end. + ok = graceful(), + exit_loop(). exit_loop() -> - init:stop(), timer:sleep(100), + init:stop(), exit_loop(). init(_) -> diff --git a/bin/emqx b/bin/emqx index 0afa81bc5..6b6a6504c 100755 --- a/bin/emqx +++ b/bin/emqx @@ -5,7 +5,8 @@ set -e set -o pipefail -if [ -n "$DEBUG" ]; then +DEBUG="${DEBUG:-0}" +if [ "$DEBUG" -eq 1 ]; then set -x fi @@ -42,12 +43,21 @@ export LD_LIBRARY_PATH="$ERTS_DIR/lib:$LD_LIBRARY_PATH" export ERTS_LIB_DIR="$ERTS_DIR/../lib" MNESIA_DATA_DIR="$RUNNER_DATA_DIR/mnesia/$NAME" +# Echo to stderr on errors +echoerr() { echo "ERROR: $*" 1>&2; } + die() { - echo >&2 "$1" + echoerr "ERROR: $1" errno=${2:-1} exit "$errno" } +assert_node_alive() { + if ! relx_nodetool "ping" > /dev/null; then + die "node_is_not_running!" 1 + fi +} + relx_usage() { command="$1" @@ -113,7 +123,7 @@ check_user() { echo "You need to be root or use sudo to run this command" exit 1 fi - CMD="\"$RUNNER_SCRIPT\" " + CMD="DEBUG=$DEBUG \"$RUNNER_SCRIPT\" " for ARG in "$@"; do CMD="${CMD} \"$ARG\"" done @@ -145,9 +155,6 @@ if [ "$ULIMIT_F" -lt 1024 ]; then echo "!!!!" fi -# Echo to stderr on errors -echoerr() { echo "$@" 1>&2; } - SED_REPLACE="sed -i " case $(sed --help 2>&1) in *GNU*) SED_REPLACE="sed -i ";; @@ -204,7 +211,7 @@ call_hocon() { export RUNNER_ETC_DIR export REL_VSN "$ERTS_DIR/bin/escript" "$ROOTDIR/bin/nodetool" hocon "$@" \ - || die "ERROR: call_hocon failed: $*" $? + || die "call_hocon_failed: $*" $? } # Run an escript in the node's environment @@ -281,7 +288,7 @@ generate_config() { # shellcheck disable=SC2086 if ! relx_nodetool chkconfig $CONFIG_ARGS; then - die "Error reading $CONFIG_ARGS" + die "failed_to_check_config $CONFIG_ARGS" fi } @@ -292,6 +299,33 @@ bootstrapd() { fi } +# check if a PID is down +is_down() { + PID="$1" + if kill -s 0 "$PID" 2>/dev/null; then + return 1 + fi + return 0 +} + +wait_for() { + local WAIT_TIME + local CMD + WAIT_TIME="$1" + shift + CMD="$*" + while true; do + if $CMD >/dev/null 2>&1; then + return 0 + fi + if [ "$WAIT_TIME" -le 0 ]; then + return 1 + fi + WAIT_TIME=$((WAIT_TIME - 1)) + sleep 1 + done +} + # Use $CWD/etc/sys.config if exists if [ -z "$RELX_CONFIG_PATH" ]; then if [ -f "$RUNNER_ETC_DIR/sys.config" ]; then @@ -326,7 +360,7 @@ if [ -z "$NAME" ]; then # shellcheck disable=SC2012,SC2086 LATEST_VM_ARGS="$(ls -t $CONFIGS_DIR/vm.*.args | head -1)" if [ -z "$LATEST_VM_ARGS" ]; then - echoerr "For command $1, there is no vm.*.args file found in $CONFIGS_DIR/" + echoerr "no_vm_arg_file_found_for $1 in $CONFIGS_DIR/" exit 1 fi NAME="$(grep -E '^-s?name' "$LATEST_VM_ARGS" | awk '{print $2}')" @@ -359,7 +393,7 @@ if [ -z "$COOKIE" ]; then # shellcheck disable=SC2012,SC2086 LATEST_VM_ARGS="$(ls -t $CONFIGS_DIR/vm.*.args | head -1)" if [ -z "$LATEST_VM_ARGS" ]; then - echo "For command $1, there is no vm.*.args config file found in $CONFIGS_DIR/" + echoerr "no_vm_arg_file_found_for $1 in $CONFIGS_DIR/" exit 1 fi COOKIE="$(grep -E '^-setcookie' "$LATEST_VM_ARGS" | awk '{print $2}')" @@ -384,8 +418,7 @@ case "$1" in start|start_boot) # Make sure a node IS not running if relx_nodetool "ping" >/dev/null 2>&1; then - echo "Node is already running!" - exit 1 + die "node_is_already_running!" fi # Bootstrap daemon command (check perms & drop to $RUNNER_USER) bootstrapd @@ -431,33 +464,35 @@ case "$1" in "$(relx_start_command)" WAIT_TIME=${WAIT_FOR_ERLANG:-15} - while [ "$WAIT_TIME" -gt 0 ]; do - if ! relx_nodetool "ping" >/dev/null 2>&1; then - WAIT_TIME=$((WAIT_TIME - 1)) - sleep 1 - continue - fi - sleep 1 - if relx_nodetool "ping" >/dev/null 2>&1; then - echo "$EMQX_DESCRIPTION $REL_VSN is started successfully!" - exit 0 - fi - done && echo "$EMQX_DESCRIPTION $REL_VSN failed to start within ${WAIT_FOR_ERLANG:-15} seconds," - echo "see the output of '$0 console' for more information." - echo "If you want to wait longer, set the environment variable" - echo "WAIT_FOR_ERLANG to the number of seconds to wait." - exit 1 + if wait_for "$WAIT_TIME" 'relx_nodetool' 'ping'; then + echo "$EMQX_DESCRIPTION $REL_VSN is started successfully!" + exit 0 + else + echo "$EMQX_DESCRIPTION $REL_VSN failed to start within ${WAIT_TIME} seconds," + echo "see the output of '$0 console' for more information." + echo "If you want to wait longer, set the environment variable" + echo "WAIT_FOR_ERLANG to the number of seconds to wait." + exit 1 + fi ;; stop) # Wait for the node to completely stop... PID="$(relx_get_pid)" if ! relx_nodetool "stop"; then + echoerr "Graceful shutdown failed PID=[$PID]" exit 1 fi - while kill -s 0 "$PID" 2>/dev/null; do - sleep 1 - done + WAIT_TIME="${WAIT_FOR_ERLANG_STOP:-60}" + if ! wait_for "$WAIT_TIME" is_down "$PID"; then + msg="dangling after ${WAIT_TIME} seconds" + # also log to syslog + logger -t "${REL_NAME}[${PID}]" "STOP: $msg" + # log to user console + echoerr "STOP: $msg" + exit 1 + fi + logger -t "${REL_NAME}[${PID}]" "STOP: OK" ;; restart|reboot) @@ -473,10 +508,7 @@ case "$1" in ;; ping) - ## See if the VM is alive - if ! relx_nodetool "ping"; then - exit 1 - fi + assert_node_alive ;; escript) @@ -487,11 +519,7 @@ case "$1" in ;; attach) - # Make sure a node IS running - if ! relx_nodetool "ping" > /dev/null; then - echo "Node is not running!" - exit 1 - fi + assert_node_alive # Bootstrap daemon command (check perms & drop to $RUNNER_USER) bootstrapd @@ -501,11 +529,7 @@ case "$1" in ;; remote_console) - # Make sure a node IS running - if ! relx_nodetool "ping" > /dev/null; then - echo "Node is not running!" - exit 1 - fi + assert_node_alive # Bootstrap daemon command (check perms & drop to $RUNNER_USER) bootstrapd @@ -523,11 +547,7 @@ case "$1" in COMMAND="$1"; shift - # Make sure a node IS running - if ! relx_nodetool "ping" > /dev/null; then - echo "Node is not running!" - exit 1 - fi + assert_node_alive ERL_FLAGS="$ERL_FLAGS $EPMD_ARG" \ exec "$BINDIR/escript" "$ROOTDIR/bin/install_upgrade.escript" \ @@ -535,11 +555,7 @@ case "$1" in ;; versions) - # Make sure a node IS running - if ! relx_nodetool "ping" > /dev/null; then - echo "Node is not running!" - exit 1 - fi + assert_node_alive COMMAND="$1"; shift @@ -601,7 +617,7 @@ case "$1" in $RELX_CONFIG_PATH $CONFIG_ARGS $EPMD_ARG # Log the startup - logger -t "${REL_NAME}[$$]" "$* -- ${1+$ARGS}" + logger -t "${REL_NAME}[$$]" "EXEC: $* -- ${1+$ARGS}" # Start the VM exec "$@" -- ${1+$ARGS} @@ -642,7 +658,7 @@ case "$1" in $RELX_CONFIG_PATH $CONFIG_ARGS $EPMD_ARG # Log the startup - logger -t "${REL_NAME}[$$]" "$* -- ${1+$ARGS}" + logger -t "${REL_NAME}[$$]" "EXEC: $* -- ${1+$ARGS}" # Start the VM exec "$@" -- ${1+$ARGS} @@ -651,54 +667,34 @@ case "$1" in echo "$ERTS_PATH" ;; ctl) - # Make sure a node IS running - if ! relx_nodetool "ping" > /dev/null; then - echo "Node is not running!" - exit 1 - fi + assert_node_alive shift relx_nodetool rpc emqx_ctl run_command "$@" ;; rpc) - # Make sure a node IS running - if ! relx_nodetool "ping" > /dev/null; then - echo "Node is not running!" - exit 1 - fi + assert_node_alive shift relx_nodetool rpc "$@" ;; rpcterms) - # Make sure a node IS running - if ! relx_nodetool "ping" > /dev/null; then - echo "Node is not running!" - exit 1 - fi + assert_node_alive shift relx_nodetool rpcterms "$@" ;; root_dir) - # Make sure a node IS running - if ! relx_nodetool "ping" > /dev/null; then - echo "Node is not running!" - exit 1 - fi + assert_node_alive shift relx_nodetool "eval" 'code:root_dir()' ;; eval) - # Make sure a node IS running - if ! relx_nodetool "ping" > /dev/null; then - echo "Node is not running!" - exit 1 - fi + assert_node_alive shift relx_nodetool "eval" "$@"