Merge pull request #9961 from zmstone/0211-fix-remsh

0211 fix remsh
This commit is contained in:
Zaiming (Stone) Shi 2023-02-13 18:20:43 +01:00 committed by GitHub
commit 43aab61a3a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 72 additions and 65 deletions

131
bin/emqx
View File

@ -85,7 +85,7 @@ die() {
# Abort unless the node answers `relx_nodetool ping`.
# The ping's stdout is discarded; only its exit status is checked.
# On failure, reports "node_is_not_running!" via die with code 1.
assert_node_alive() {
if ! relx_nodetool "ping" > /dev/null; then
die "node_is_not_running!" 1
exit 1
fi
}
@ -230,6 +230,7 @@ usage() {
}
COMMAND="${1:-}"
GREP='grep --color=never'
if [ -z "$COMMAND" ]; then
usage 'help'
@ -299,6 +300,8 @@ fi
# Make sure log directory exists
mkdir -p "$RUNNER_LOG_DIR"
# turn off debug as this is static
set +x
COMPATIBILITY_CHECK='
io:format("BEAM_OK~n", []),
try
@ -321,50 +324,47 @@ COMPATIBILITY_CHECK='
end,
halt(0).
'
[ "$DEBUG" -eq 1 ] && set -x
# Run the Erlang snippet held in $COMPATIBILITY_CHECK in a non-interactive VM
# ("$BINDIR/$PROGNAME" -noshell) booted from "$REL_DIR/start_clean".
# Outputs: whatever the snippet prints; callers grep this output for the
#          BEAM_OK / CRYPTO_OK markers.
compatiblity_info() {
# RELEASE_LIB is used by Elixir
# set crash-dump bytes to zero to ensure no crash dump is generated when erl crashes
env ERL_CRASH_DUMP_BYTES=0 "$BINDIR/$PROGNAME" \
-noshell \
-boot_var RELEASE_LIB "$ERTS_LIB_DIR/lib" \
-boot "$REL_DIR/start_clean" \
-boot_var RELEASE_LIB "$ERTS_LIB_DIR/lib" \
-eval "$COMPATIBILITY_CHECK"
}
# Collect Erlang/OTP runtime sanity and compatibility in one go
if [ "$IS_BOOT_COMMAND" = 'yes' ]; then
# Probe the Erlang runtime with compatiblity_info and, if the first probe does
# not report CRYPTO_OK, retry with $DYNLIBS_DIR placed first on LD_LIBRARY_PATH.
# Globals: REL_DIR, DYNLIBS_DIR, REL_VSN, DEBUG (read);
#          BUILD_INFO, COMPATIBILITY_INFO (written); LD_LIBRARY_PATH (exported)
# Exits:   1 if the retry output lacks BEAM_OK, 2 if it lacks CRYPTO_OK.
#          Otherwise logs a warning that libs from DYNLIBS_DIR are in use.
maybe_use_portable_dynlibs() {
# Read BUILD_INFO early as the next commands may mess up the shell
BUILD_INFO="$(cat "${REL_DIR}/BUILD_INFO")"
# first probe: stderr is dropped and a failing probe is tolerated
COMPATIBILITY_INFO="$(compatiblity_info 2>/dev/null || true)"
if ! (echo -e "$COMPATIBILITY_INFO" | grep -q 'CRYPTO_OK'); then
if ! (echo -e "$COMPATIBILITY_INFO" | $GREP -q 'CRYPTO_OK'); then
## failed to start, might be due to missing libs, try to be portable
# use DYNLIBS_DIR alone when LD_LIBRARY_PATH is unset or empty, otherwise prepend it
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-$DYNLIBS_DIR}"
if [ "$LD_LIBRARY_PATH" != "$DYNLIBS_DIR" ]; then
export LD_LIBRARY_PATH="$DYNLIBS_DIR:$LD_LIBRARY_PATH"
fi
## Turn off debug, because COMPATIBILITY_INFO needs to capture stderr
set +x
# second probe: stderr is captured so it can be shown in the error report below
COMPATIBILITY_INFO="$(compatiblity_info 2>&1 || true)"
if ! (echo -e "$COMPATIBILITY_INFO" | grep -q 'BEAM_OK'); then
if ! (echo -e "$COMPATIBILITY_INFO" | $GREP -q 'BEAM_OK'); then
## not able to start beam.smp
set +x
logerr "$COMPATIBILITY_INFO"
logerr "Please ensure it is running on the correct platform:"
logerr "$BUILD_INFO"
logerr "Version=$REL_VSN"
logerr "Required dependencies: openssl-1.1.1 (libcrypto), libncurses and libatomic1"
exit 1
elif ! (echo -e "$COMPATIBILITY_INFO" | grep -q 'CRYPTO_OK'); then
elif ! (echo -e "$COMPATIBILITY_INFO" | $GREP -q 'CRYPTO_OK'); then
## not able to start crypto app
set +x
logerr "$COMPATIBILITY_INFO"
exit 2
fi
logwarn "Using libs from '${DYNLIBS_DIR}' due to missing from the OS."
fi
# restore shell tracing if DEBUG=1 (it was switched off above)
[ "$DEBUG" -eq 1 ] && set -x
fi
}
# Warn the user if ulimit -n is less than 1024
ULIMIT_F=$(ulimit -n)
@ -397,8 +397,6 @@ remsh() {
# Generate a unique id used to allow multiple remsh to the same node
# transparently
id="remsh$(relx_gen_id)-${NAME}"
# Get the node's ticktime so that we use the same thing.
TICKTIME="$(relx_nodetool rpcterms net_kernel get_net_ticktime)"
# shellcheck disable=SC2086
# Setup remote shell command to control node
@ -450,13 +448,16 @@ call_hocon() {
## Resolve boot configs in a batch
## This is because starting the Erlang beam with all modules loaded
## and parsing HOCON config + environment variables is a non-trivial task
CONF_KEYS=( 'node.data_dir' 'node.name' 'node.cookie' 'node.db_backend' 'cluster.proto_dist' )
CONF_KEYS=( 'node.data_dir' 'node.name' 'node.cookie' 'node.db_backend' 'cluster.proto_dist' 'node.dist_net_ticktime' )
if [ "$IS_ENTERPRISE" = 'yes' ]; then
CONF_KEYS+=( 'license.key' )
fi
# Turn off debug as the ps output can be quite noisy
set +x
if [ "$IS_BOOT_COMMAND" = 'yes' ]; then
[ -f "$EMQX_ETC_DIR"/emqx.conf ] || die "emqx.conf is not found in $EMQX_ETC_DIR" 1
maybe_use_portable_dynlibs
if [ "${EMQX_BOOT_CONFIGS:-}" = '' ]; then
EMQX_BOOT_CONFIGS="$(call_hocon -s "$SCHEMA_MOD" -c "$EMQX_ETC_DIR"/emqx.conf multi_get "${CONF_KEYS[@]}")"
## export here so the 'console' command recursively called from
@ -464,37 +465,68 @@ if [ "$IS_BOOT_COMMAND" = 'yes' ]; then
export EMQX_BOOT_CONFIGS
fi
else
# For non-boot commands, we try to get data_dir and ssl_dist_optfile from 'ps -ef' output
# For non-boot commands, we need below runtime facts to connect to the running node:
# 1. The running node name;
# 2. The Erlang cookie in use by the running node name;
# 3. SSL options if the node is using TLS for Erlang distribution;
# 4. Erlang kernel application's net_ticktime config.
#
# There are 3 sources of truth to get those runtime information.
# Listed in the order of preference:
# 1. The boot command (which can be inspected from 'ps -ef' command output)
# 2. The generated vm.<time>.config file located in the dir pointed by 'node.data_dir'
# 3. The bootstrap config 'etc/emqx.conf'
#
# If reading from source 1 fails, the information is retrieved from source 3,
# i.e. source 2 is never used.
#
# NOTES:
# * We should avoid getting runtime information with the 3rd approach because 'etc/emqx.conf' might
# be updated after the node is started. e.g. If a user starts the node with name 'emqx@127.0.0.1'
# then updates the config in the file to 'node.name = "emqx@local.net"'; after this change,
# there would be no way to stop the running node 'emqx@127.0.0.1', because the 'emqx stop' command
# would try to stop the new node instead.
# * The primary grep pattern is $RUNNER_ROOT_DIR because one can start multiple nodes at the same time
# * The grep pattern is written as '[e]mqx' rather than 'emqx' to avoid matching the grep command itself
# * The running 'remsh' and 'nodetool' processes must be excluded
# shellcheck disable=SC2009
PS_LINE="$(ps -ef | grep "\-[r]oot $RUNNER_ROOT_DIR" || true)"
if [ "$(echo -e "$PS_LINE" | wc -l)" -eq 1 ]; then
## only one emqx node is running
## strip 'emqx_data_dir ' and ' --' because the dir in between may contain spaces
DATA_DIR="$(echo -e "$PS_LINE" | grep -oE "\-emqx_data_dir.*" | sed -E 's#.+emqx_data_dir[[:blank:]]##g' | sed -E 's#[[:blank:]]--$##g' || true)"
if [ "$DATA_DIR" = '' ]; then
## this should not happen unless -emqx_data_dir is not set
die "node_is_not_running!" 1
fi
# get ssl_dist_optfile option
SSL_DIST_OPTFILE="$(echo -e "$PS_LINE" | grep -oE '\-ssl_dist_optfile\s.+\s' | awk '{print $2}' || true)"
if [ -z "$SSL_DIST_OPTFILE" ]; then
EMQX_BOOT_CONFIGS="node.data_dir=${DATA_DIR}\ncluster.proto_dist=inet_tcp"
PS_LINE="$(ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true)"
[ "$DEBUG" -eq 1 ] && echo "EMQX processes: $PS_LINE"
# Count the candidate node processes found in PS_LINE (one per line).
# 'echo | wc -l' reports 1 even for empty input, so an empty PS_LINE (no node
# found) is counted explicitly as zero; otherwise the "none running" case would
# be mistaken for "exactly one node running" and never fall back to emqx.conf.
if [ -z "$PS_LINE" ]; then
    running_nodes_count=0
else
    running_nodes_count="$(echo -e "$PS_LINE" | wc -l)"
fi
if [ "$running_nodes_count" -eq 1 ]; then
## only one emqx node is running, get running args from 'ps -ef' output
tmp_nodename=$(echo -e "$PS_LINE" | $GREP -oE "\s\-s?name.*" | awk '{print $2}' || true)
tmp_cookie=$(echo -e "$PS_LINE" | $GREP -oE "\s\-setcookie.*" | awk '{print $2}' || true)
tmp_dist="$(echo -e "$PS_LINE" | $GREP -oE '\-ssl_dist_optfile\s.+\s' | awk '{print $2}' || true)"
tmp_ticktime="$(echo -e "$PS_LINE" | $GREP -oE '\s\-kernel\snet_ticktime\s.+\s' | awk '{print $3}' || true)"
# data_dir is actually not needed, but kept anyway
tmp_datadir="$(echo -e "$PS_LINE" | $GREP -oE "\-emqx_data_dir.*" | sed -E 's#.+emqx_data_dir[[:blank:]]##g' | sed -E 's#[[:blank:]]--$##g' || true)"
if [ -z "$tmp_dist" ]; then
tmp_proto='inet_tcp'
else
EMQX_BOOT_CONFIGS="node.data_dir=${DATA_DIR}\ncluster.proto_dist=inet_tls"
tmp_proto='inet_tls'
fi
## Make the format like what call_hocon multi_get prints out, but only need 4 args
EMQX_BOOT_CONFIGS="node.name=${tmp_nodename}\nnode.cookie=${tmp_cookie}\ncluster.proto_dist=${tmp_proto}\nnode.dist_net_ticktime=$tmp_ticktime\nnode.data_dir=${tmp_datadir}"
else
## None or more than one node is running, resolve from boot config
## we have no choice but to read the bootstrap config (with environment overrides available in the current shell)
[ -f "$EMQX_ETC_DIR"/emqx.conf ] || die "emqx.conf is not found in $EMQX_ETC_DIR" 1
maybe_use_portable_dynlibs
EMQX_BOOT_CONFIGS="$(call_hocon -s "$SCHEMA_MOD" -c "$EMQX_ETC_DIR"/emqx.conf multi_get "${CONF_KEYS[@]}")"
fi
fi
[ "$DEBUG" -eq 1 ] && echo "EMQX_BOOT_CONFIGS: $EMQX_BOOT_CONFIGS"
[ "$DEBUG" -eq 1 ] && set -x
# Print the value stored under a config path in $EMQX_BOOT_CONFIGS.
# Arguments: $1 - config path, e.g. 'node.name'
# Outputs:   every line containing "<path>=", with that prefix and all
#            double-quote characters removed
get_boot_config() {
path_to_value="$1"
# echo -e expands the literal \n separators used when EMQX_BOOT_CONFIGS is assembled by hand
echo -e "$EMQX_BOOT_CONFIGS" | grep "$path_to_value=" | sed -e "s/$path_to_value=//g" | tr -d \"
echo -e "$EMQX_BOOT_CONFIGS" | $GREP "$path_to_value=" | sed -e "s/$path_to_value=//g" | tr -d \"
}
EPMD_ARGS="-start_epmd false -epmd_module ekka_epmd -proto_dist ekka"
PROTO_DIST="$(get_boot_config 'cluster.proto_dist' || true)"
TICKTIME="$(get_boot_config 'node.dist_net_ticktime' || echo '120')"
# this environment variable is required by ekka_dist module
# because proto_dist is overridden to ekka, and there is a lack of ekka_tls module
export EKKA_PROTO_DIST_MOD="${PROTO_DIST:-inet_tcp}"
@ -717,7 +749,7 @@ generate_config() {
ARG_KEY=$(echo "$ARG_LINE" | awk '{$NF="";print}')
ARG_VALUE=$(echo "$ARG_LINE" | awk '{print $NF}')
## use the key to look up in vm.args file for the value
TMP_ARG_VALUE=$(grep "^$ARG_KEY" "$TMP_ARG_FILE" || true | awk '{print $NF}')
## NOTE: '|| true' must guard the whole pipeline. Written as 'grep ... || true | awk ...'
## the shell parses it as 'grep ... || (true | awk ...)', so awk never sees grep's output
## and the whole matched line (not its last field) would be compared below.
TMP_ARG_VALUE=$($GREP "^$ARG_KEY" "$TMP_ARG_FILE" | awk '{print $NF}' || true)
## compare generated (to override) value to original (to be overridden) value
if [ "$ARG_VALUE" != "$TMP_ARG_VALUE" ] ; then
## if they are different
@ -742,7 +774,7 @@ is_down() {
if ps -p "$PID" >/dev/null; then
# still around
# shellcheck disable=SC2009 # this grep pattern is not a part of the program names
if ps -p "$PID" | grep -q 'defunct'; then
if ps -p "$PID" | $GREP -q 'defunct'; then
# zombie state, print parent pid
parent="$(ps -o ppid= -p "$PID" | tr -d ' ')"
logwarn "$PID is marked <defunct>, parent: $(ps -p "$parent")"
@ -792,23 +824,6 @@ wait_until_return_val() {
done
}
# Print the path of the generated vm.*.args file under $CONFIGS_DIR whose path
# sorts last (find | sort | tail -1).
# Arguments: $1 - name of the environment variable suggested to the user in the
#                 error hint as a way to continue without the file
# Outputs:   the file path on stdout
# Exits:     1, after logging the error, when no such regular file is found
latest_vm_args() {
local hint_var_name="$1"
local vm_args_file
vm_args_file="$(find "$CONFIGS_DIR" -type f -name "vm.*.args" | sort | tail -1)"
if [ -f "$vm_args_file" ]; then
echo "$vm_args_file"
else
# turn off shell tracing before printing the error messages
set +x
logerr "Node not initialized?"
logerr "Generated config file vm.*.args is not found for command '$COMMAND'"
logerr "in config dir: $CONFIGS_DIR"
logerr "In case the file has been deleted while the node is running,"
logerr "set environment variable '$hint_var_name' to continue"
exit 1
fi
}
# backward compatible with 4.x
tr_log_to_env() {
local log_to=${EMQX_LOG__TO:-undefined}
@ -851,6 +866,7 @@ maybe_log_to_console() {
fi
}
## To be backward compatible, read and then unset EMQX_NODE_NAME
if [ -n "${EMQX_NODE_NAME:-}" ]; then
export EMQX_NODE__NAME="${EMQX_NODE_NAME}"
unset EMQX_NODE_NAME
@ -862,13 +878,7 @@ fi
## or long name (with '@') e.g. 'emqx@example.net' or 'emqx@127.0.0.1'
NAME="${EMQX_NODE__NAME:-}"
if [ -z "$NAME" ]; then
if [ "$IS_BOOT_COMMAND" = 'yes' ]; then
# for boot commands, inspect emqx.conf for node name
NAME="$(get_boot_config 'node.name')"
else
vm_args_file="$(latest_vm_args 'EMQX_NODE__NAME')"
NAME="$(grep -E '^-s?name' "${vm_args_file}" | awk '{print $2}')"
fi
fi
# force to use 'emqx' short name
@ -883,7 +893,7 @@ case "$NAME" in
esac
SHORT_NAME="$(echo "$NAME" | awk -F'@' '{print $1}')"
if ! (echo "$SHORT_NAME" | grep -q '^[0-9A-Za-z_\-]\+$'); then
echo "Invalid node name, should be of format '^[0-9A-Za-z_-]+$'."
logerr "Invalid node name, should be of format '^[0-9A-Za-z_-]+$'."
exit 1
fi
# This also changes the program name from 'beam.smp' to node name
@ -894,18 +904,13 @@ PIPE_DIR="${PIPE_DIR:-/$DATA_DIR/${WHOAMI}_erl_pipes/$NAME/}"
## Resolve Erlang cookie.
if [ -n "${EMQX_NODE_COOKIE:-}" ]; then
## To be backward compatible, read EMQX_NODE_COOKIE
## To be backward compatible, read and unset EMQX_NODE_COOKIE
export EMQX_NODE__COOKIE="${EMQX_NODE_COOKIE}"
unset EMQX_NODE_COOKIE
fi
COOKIE="${EMQX_NODE__COOKIE:-}"
if [ -z "$COOKIE" ]; then
if [ "$IS_BOOT_COMMAND" = 'yes' ]; then
COOKIE="$(get_boot_config 'node.cookie')"
else
vm_args_file="$(latest_vm_args 'EMQX_NODE__COOKIE')"
COOKIE="$(grep -E '^-setcookie' "${vm_args_file}" | awk '{print $2}')"
fi
fi
[ -z "$COOKIE" ] && COOKIE="$EMQX_DEFAULT_ERLANG_COOKIE"
if [ $IS_BOOT_COMMAND = 'yes' ] && [ "$COOKIE" = "$EMQX_DEFAULT_ERLANG_COOKIE" ]; then
@ -930,7 +935,7 @@ case "${COMMAND}" in
start)
# Make sure a node IS not running
if relx_nodetool "ping" >/dev/null 2>&1; then
die "node_is_already_running!"
die "Node $NAME is already running!"
fi
# this flag passes down to console mode

View File

@ -0,0 +1 @@
Avoid parsing config files for node name and cookie when executing non-boot commands in bin/emqx.

View File

@ -0,0 +1 @@
在 bin/emqx 脚本中,避免在运行非启动命令时解析 emqx.conf 来获取节点名称和 cookie。