From 6b81d9965f715c890868dc4d62289e0ffc471163 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Thu, 16 Feb 2023 09:59:20 +0100 Subject: [PATCH] fix(bin/emqx): allow starting two nodes from the same installation If more than one node is boot from the same root directory try to find the node by node name set in EMQX_NODE_NAME or EMQX_NODE__NAME environment variable --- bin/emqx | 64 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 22 deletions(-) diff --git a/bin/emqx b/bin/emqx index 6f953b777..9211bd338 100755 --- a/bin/emqx +++ b/bin/emqx @@ -76,6 +76,12 @@ logwarn() { fi } +logdebug() { + if [ "$DEBUG" -eq 1 ]; then + echo "DEBUG: $*" + fi +} + die() { set +x logerr "$1" @@ -453,24 +459,37 @@ if [ "$IS_ENTERPRISE" = 'yes' ]; then CONF_KEYS+=( 'license.key' ) fi - -## Find the running node from 'ps -ef' -## The primary grep pattern is $RUNNER_ROOT_DIR because one can start multiple nodes at the same time -# shellcheck disable=SC2009 -PS_LINE="$(ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true)" -if [ -n "${PS_LINE}" ]; then - RUNNING_NODES_COUNT="$(echo -e "$PS_LINE" | wc -l)" -else - RUNNING_NODES_COUNT=0 +## To be backward compatible, read and then unset EMQX_NODE_NAME +if [ -n "${EMQX_NODE_NAME:-}" ]; then + export EMQX_NODE__NAME="${EMQX_NODE_NAME}" + unset EMQX_NODE_NAME fi # Turn off debug as the ps output can be quite noisy set +x + +## Find the running node from 'ps -ef' +## * The grep args like '[e]mqx' but not 'emqx' is to avoid greping the grep command itself +## * The running 'remsh' and 'nodetool' processes must be excluded +if [ -n "${EMQX_NODE__NAME:-}" ]; then + # if node name is provided, filter by node name + # shellcheck disable=SC2009 + PS_LINE="$(ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -E "\s\-s?name\s${EMQX_NODE__NAME}" | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true)" +else + # shellcheck disable=SC2009 + PS_LINE="$(ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true)" +fi +logdebug "PS_LINE=$PS_LINE" +RUNNING_NODES_COUNT="$(echo -e "$PS_LINE" | sed '/^\s*$/d' | wc -l)" +[ "$RUNNING_NODES_COUNT" -gt 1 ] && logdebug "More than one running node found: count=$RUNNING_NODES_COUNT" + if [ "$IS_BOOT_COMMAND" = 'yes' ]; then if [ "$RUNNING_NODES_COUNT" -gt 0 ] && [ "$COMMAND" != 'check_config' ]; then - tmp_nodename=$(echo -e "$PS_LINE" | $GREP -oE "\s\-s?name.*" | awk '{print $2}' || true) - echo "Node ${tmp_nodename} is already running!" - exit 1 + running_node_name=$(echo -e "$PS_LINE" | $GREP -oE "\s\-s?name.*" | awk '{print $2}' || true) + if [ -n "$running_node_name" ] && [ "$running_node_name" = "${EMQX_NODE__NAME:-}" ]; then + echo "Node ${running_node_name} is already running!" + exit 1 + fi fi [ -f "$EMQX_ETC_DIR"/emqx.conf ] || die "emqx.conf is not found in $EMQX_ETC_DIR" 1 maybe_use_portable_dynlibs @@ -502,8 +521,6 @@ else # then update the config in the file to 'node.name = "emqx@local.net"', after this change, # there would be no way stop the running node 'emqx@127.0.0.1', because 'emqx stop' command # would try to stop the new node instead. - # * The grep args like '[e]mqx' but not 'emqx' is to avoid greping the grep command itself - # * The running 'remsh' and 'nodetool' processes must be excluded if [ "$RUNNING_NODES_COUNT" -eq 1 ]; then ## only one emqx node is running, get running args from 'ps -ef' output tmp_nodename=$(echo -e "$PS_LINE" | $GREP -oE "\s\-s?name.*" | awk '{print $2}' || true) @@ -520,14 +537,22 @@ else ## Make the format like what call_hocon multi_get prints out, but only need 4 args EMQX_BOOT_CONFIGS="node.name=${tmp_nodename}\nnode.cookie=${tmp_cookie}\ncluster.proto_dist=${tmp_proto}\nnode.dist_net_ticktime=$tmp_ticktime\nnode.data_dir=${tmp_datadir}" else - ## None or more than one node is running, resolve from boot config - ## we have no choiece but to read the bootstrap config (with environment overrides available in the current shell) + if [ "$RUNNING_NODES_COUNT" -gt 1 ]; then + if [ -z "${EMQX_NODE__NAME:-}" ]; then + tmp_nodenames=$(echo -e "$PS_LINE" | $GREP -oE "\s\-s?name.*" | awk '{print $2}' | tr '\n' ' ') + logerr "More than one EMQX node found running (root dir: ${RUNNER_ROOT_DIR})" + logerr "Running nodes: $tmp_nodenames" + logerr "Make sure environment variable EMQX_NODE__NAME is set to indicate for which node this command is intended." + exit 1 + fi + fi + ## We have no choiece but to read the bootstrap config (with environment overrides available in the current shell) [ -f "$EMQX_ETC_DIR"/emqx.conf ] || die "emqx.conf is not found in $EMQX_ETC_DIR" 1 maybe_use_portable_dynlibs EMQX_BOOT_CONFIGS="$(call_hocon -s "$SCHEMA_MOD" -c "$EMQX_ETC_DIR"/emqx.conf multi_get "${CONF_KEYS[@]}")" fi fi -[ "$DEBUG" -eq 1 ] && echo "EMQX_BOOT_CONFIGS: $EMQX_BOOT_CONFIGS" +logdebug "EMQX_BOOT_CONFIGS: $EMQX_BOOT_CONFIGS" [ "$DEBUG" -eq 1 ] && set -x get_boot_config() { @@ -877,11 +902,6 @@ maybe_log_to_console() { fi } -## To be backward compatible, read and then unset EMQX_NODE_NAME -if [ -n "${EMQX_NODE_NAME:-}" ]; then - export EMQX_NODE__NAME="${EMQX_NODE_NAME}" - unset EMQX_NODE_NAME -fi ## Possible ways to configure emqx node name: ## 1. configure node.name in emqx.conf ## 2. override with environment variable EMQX_NODE__NAME