refactor: read node name and cookie from ps -ef

instead of parsing the generated vm.args file, because the file might have been deleted
2023-02-12 11:32:06 +01:00 · 2023-02-12 11:32:06 +01:00 · d08eb01d90
parent b6b9df06db
commit d08eb01d90
1 changed files with 26 additions and 46 deletions
--- a/bin/emqx
+++ b/bin/emqx
@ -299,6 +299,8 @@ fi
 # Make sure log directory exists
 mkdir -p "$RUNNER_LOG_DIR"
 # turn off debug as this is static
 set +x
 COMPATIBILITY_CHECK='
    io:format("BEAM_OK~n", []),
    try
@ -321,14 +323,15 @@ COMPATIBILITY_CHECK='
    end,
    halt(0).
 '
 [ "$DEBUG" -eq 1 ] && set -x
 # Run "$BINDIR/$PROGNAME" without an interactive shell, boot it from
 # "$REL_DIR/start_clean" and evaluate the code held in $COMPATIBILITY_CHECK.
 # Globals (read): BINDIR, PROGNAME, ERTS_LIB_DIR, REL_DIR, COMPATIBILITY_CHECK
 # Outputs:        whatever the launched program prints
 # Returns:        the exit status of the launched program
 compatiblity_info() {
  # RELEASE_LIB is used by Elixir
  # set crash-dump bytes to zero to ensure no crash dump is generated when erl crashes
  # -boot_var is given exactly once, ahead of the -boot script that refers to it
  env ERL_CRASH_DUMP_BYTES=0 "$BINDIR/$PROGNAME" \
    -noshell \
    -boot_var RELEASE_LIB "$ERTS_LIB_DIR/lib" \
    -boot "$REL_DIR/start_clean" \
    -eval "$COMPATIBILITY_CHECK"
 }
@ -464,6 +467,8 @@ if [ "$IS_BOOT_COMMAND" = 'yes' ]; then
        export EMQX_BOOT_CONFIGS
    fi
 else
    # Turn off debug as the ps output can be quite noisy
    set +x
    # For non-boot commands, we need below runtime facts to connect to the running node:
    #  1. The running node name.
    #  2. The Erlang cookie in use by the running node name.
@ -481,31 +486,33 @@ else
    #    then update the config in the file to 'node.name = "emqx@local.net"', after this change,
    #    there would be no way to stop the running node 'emqx@127.0.0.1', because 'emqx stop' command
    #    would try to stop the new node instead.
    #  * The node name and Erlang cookie can be found in 'ps -ef' output, but they are parsed from generated config instead.
    #  * The primary grep pattern is $RUNNER_ROOT_DIR because one can start multiple nodes at the same time
    #  * The grep pattern is '[e]mqx' rather than 'emqx' to avoid grepping the grep command itself
-    #  * The running 'remsh' and 'escript' processes must be excluded
+    #  * The running 'remsh' and 'nodetool' processes must be excluded
    # shellcheck disable=SC2009
    PS_LINE="$(ps -ef | grep '[e]mqx' | grep -v -E '(remsh|nodetool)' | grep -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true)"
    [ "$DEBUG" -eq 1 ] && echo "EMQX processes: $PS_LINE"
    if [ "$(echo -e "$PS_LINE" | wc -l)" -eq 1 ]; then
-        ## only one emqx node is running
+        ## only one emqx node is running, get running args from 'ps -ef' output
-        ## strip 'emqx_data_dir ' and ' --' because the dir in between may contain spaces
+        tmp_nodename=$(echo -e "$PS_LINE" | grep -oE "\s\-s?name.*" | awk '{print $2}' || true)
-        DATA_DIR="$(echo -e "$PS_LINE" | grep -oE "\-emqx_data_dir.*" | sed -E 's#.+emqx_data_dir[[:blank:]]##g' | sed -E 's#[[:blank:]]--$##g' || true)"
+        tmp_cookie=$(echo -e "$PS_LINE" | grep -oE "\s\-setcookie.*" | awk '{print $2}' || true)
-        if [ "$DATA_DIR" = '' ]; then
+        tmp_dist="$(echo -e "$PS_LINE" | grep -oE '\-ssl_dist_optfile\s.+\s' | awk '{print $2}' || true)"
-            ## this should not happen unless -emqx_data_dir is not set
+        # data_dir is actually not needed, but kept anyway
-            die "node_is_not_running!" 1
+        tmp_daadir="$(echo -e "$PS_LINE" | grep -oE "\-emqx_data_dir.*" | sed -E 's#.+emqx_data_dir[[:blank:]]##g' | sed -E 's#[[:blank:]]--$##g' || true)"
-        fi
+        if [ -z "$tmp_dist" ]; then
-        # get ssl_dist_optfile option
+            tmp_proto='inet_tcp'
        SSL_DIST_OPTFILE="$(echo -e "$PS_LINE" | grep -oE '\-ssl_dist_optfile\s.+\s' | awk '{print $2}' || true)"
        if [ -z "$SSL_DIST_OPTFILE" ]; then
            EMQX_BOOT_CONFIGS="node.data_dir=${DATA_DIR}\ncluster.proto_dist=inet_tcp"
        else
-            EMQX_BOOT_CONFIGS="node.data_dir=${DATA_DIR}\ncluster.proto_dist=inet_tls"
+            tmp_proto='inet_tls'
        fi
        ## Make the format like what call_hocon multi_get prints out, but only need 4 args
        EMQX_BOOT_CONFIGS="node.name=${tmp_nodename}\nnode.cookie=${tmp_cookie}\ncluster.proto_dist=${tmp_proto}\nnode.data_dir=${tmp_daadir}"
        [ "$DEBUG" -eq 1 ] && echo "EMQX boot-configs: $EMQX_BOOT_CONFIGS"
    else
        ## None or more than one node is running, resolve from boot config
        ## we have no choice but to read the bootstrap config (with environment overrides available in the current shell)
        EMQX_BOOT_CONFIGS="$(call_hocon -s "$SCHEMA_MOD" -c "$EMQX_ETC_DIR"/emqx.conf multi_get "${CONF_KEYS[@]}")"
    fi
    [ "$DEBUG" -eq 1 ] && set -x
 fi
 get_boot_config() {
@ -812,23 +819,6 @@ wait_until_return_val() {
    done
 }
 # Print the path of the generated vm.*.args file under $CONFIGS_DIR that
 # sorts last by name.
 # Globals (read): CONFIGS_DIR, COMMAND
 # Arguments:      $1 - name of the environment variable suggested to the
 #                      user as a workaround when no file is found
 # Outputs:        the file path on stdout; on failure, diagnostics via logerr
 # Exits:          with status 1 when no matching regular file exists
 latest_vm_args() {
    local hint_var_name="$1"
    local newest
    newest="$(find "$CONFIGS_DIR" -type f -name "vm.*.args" | sort | tail -1)"
    if [ ! -f "$newest" ]; then
        # keep the error report free of xtrace noise
        set +x
        logerr "Node not initialized?"
        logerr "Generated config file vm.*.args is not found for command '$COMMAND'"
        logerr "in config dir: $CONFIGS_DIR"
        logerr "In case the file has been deleted while the node is running,"
        logerr "set environment variable '$hint_var_name' to continue"
        exit 1
    fi
    echo "$newest"
 }
 # backward compatible with 4.x
 tr_log_to_env() {
    local log_to=${EMQX_LOG__TO:-undefined}
@ -871,6 +861,7 @@ maybe_log_to_console() {
    fi
 }
 ## To be backward compatible, read and then unset EMQX_NODE_NAME
 if [ -n "${EMQX_NODE_NAME:-}" ]; then
    export EMQX_NODE__NAME="${EMQX_NODE_NAME}"
    unset EMQX_NODE_NAME
@ -882,13 +873,7 @@ fi
 ## or long name (with '@') e.g. 'emqx@example.net' or 'emqx@127.0.0.1'
 NAME="${EMQX_NODE__NAME:-}"
 if [ -z "$NAME" ]; then
-    if [ "$IS_BOOT_COMMAND" = 'yes' ]; then
+    NAME="$(get_boot_config 'node.name')"
        # for boot commands, inspect emqx.conf for node name
        NAME="$(get_boot_config 'node.name')"
    else
        vm_args_file="$(latest_vm_args 'EMQX_NODE__NAME')"
        NAME="$(grep -E '^-s?name' "${vm_args_file}" | awk '{print $2}')"
    fi
 fi
 # force to use 'emqx' short name
@ -914,18 +899,13 @@ PIPE_DIR="${PIPE_DIR:-/$DATA_DIR/${WHOAMI}_erl_pipes/$NAME/}"
 ## Resolve Erlang cookie.
 if [ -n "${EMQX_NODE_COOKIE:-}" ]; then
-    ## To be backward compatible, read EMQX_NODE_COOKIE
+    ## To be backward compatible, read and unset EMQX_NODE_COOKIE
    export EMQX_NODE__COOKIE="${EMQX_NODE_COOKIE}"
    unset EMQX_NODE_COOKIE
 fi
 COOKIE="${EMQX_NODE__COOKIE:-}"
 if [ -z "$COOKIE" ]; then
-    if [ "$IS_BOOT_COMMAND" = 'yes' ]; then
+    COOKIE="$(get_boot_config 'node.cookie')"
        COOKIE="$(get_boot_config 'node.cookie')"
    else
        vm_args_file="$(latest_vm_args 'EMQX_NODE__COOKIE')"
        COOKIE="$(grep -E '^-setcookie' "${vm_args_file}" | awk '{print $2}')"
    fi
 fi
 [ -z "$COOKIE" ] && COOKIE="$EMQX_DEFAULT_ERLANG_COOKIE"
 if [ $IS_BOOT_COMMAND = 'yes' ] && [ "$COOKIE" = "$EMQX_DEFAULT_ERLANG_COOKIE" ]; then