From ce902b5405796b26219a1c0fa093ae282fc9d8e5 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Sun, 17 Apr 2022 11:51:38 +0200 Subject: [PATCH] refactor(bin/emqx): speed up boot The main slow-down is the overhead of booting up beam with the 'start_clean' boot file (which loads all modules). Prior to this change, beam was started multiple times in order to resolve configuration values. After this change: * For boot commands such as 'start', 'console' and 'foreground', it starts beam twice: - 1st is to check platform compatibility - 2nd is to resolve all configs required for boot in a batch * For non-boot commands, such as 'ctl' and 'ping', it does not need to start beam for config resolution at all --- apps/emqx_conf/src/emqx_conf_schema.erl | 2 +- bin/emqx | 236 ++++++++++++++---------- bin/emqx.cmd | 24 +-- build | 2 +- 4 files changed, 148 insertions(+), 116 deletions(-) diff --git a/apps/emqx_conf/src/emqx_conf_schema.erl b/apps/emqx_conf/src/emqx_conf_schema.erl index a4e13ec27..0fcbc7b32 100644 --- a/apps/emqx_conf/src/emqx_conf_schema.erl +++ b/apps/emqx_conf/src/emqx_conf_schema.erl @@ -351,7 +351,7 @@ fields("node") -> , 'readOnly' => true })} , {"dist_net_ticktime", - sc(emqx_schema:duration(), + sc(emqx_schema:duration_s(), #{ mapping => "vm_args.-kernel net_ticktime" , default => "2m" , 'readOnly' => true diff --git a/bin/emqx b/bin/emqx index 23b310cb1..1bde00914 100755 --- a/bin/emqx +++ b/bin/emqx @@ -10,6 +10,7 @@ if [ "$DEBUG" -eq 1 ]; then fi RUNNER_ROOT_DIR="$(cd "$(dirname "$(readlink "$0" || echo "$0")")"/..; pwd -P)" + # shellcheck disable=SC1090,SC1091 . 
"$RUNNER_ROOT_DIR"/releases/emqx_vars @@ -39,7 +40,7 @@ export ERTS_LIB_DIR="$RUNNER_ROOT_DIR/lib" DYNLIBS_DIR="$RUNNER_ROOT_DIR/dynlibs" # Echo to stderr on errors -echoerr() { echo "ERROR: $*" 1>&2; } +echoerr() { echo -e "$*" 1>&2; } die() { echoerr "ERROR: $1" @@ -53,20 +54,6 @@ assert_node_alive() { fi } - -# Echo to stderr on errors -echoerr() { echo "$*" 1>&2; } - -check_erlang_start() { - # RELEASE_LIB is used by Elixir - "$BINDIR/$PROGNAME" \ - -noshell \ - -boot_var RELEASE_LIB "$ERTS_LIB_DIR/lib" \ - -boot "$REL_DIR/start_clean" \ - -s crypto start \ - -s erlang halt -} - usage() { local command="$1" @@ -194,7 +181,7 @@ usage() { echo "More:" echo " Shell attach: remote_console | attach" echo " Up/Down-grade: upgrade | downgrade | install | uninstall" - echo " Install info: ertspath | root_dir | versions" + echo " Install info: ertspath | root_dir" echo " Runtime info: pid | ping | versions" echo " Advanced: console_clean | escript | rpc | rpcterms | eval | eval-erl" echo '' @@ -221,25 +208,23 @@ if [ "${2:-}" = 'help' ]; then fi fi -if ! check_erlang_start >/dev/null 2>&1; then - BUILD_INFO="$(cat "${REL_DIR}/BUILD_INFO")" - ## failed to start, might be due to missing libs, try to be portable - export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-$DYNLIBS_DIR}" - if [ "$LD_LIBRARY_PATH" != "$DYNLIBS_DIR" ]; then - export LD_LIBRARY_PATH="$DYNLIBS_DIR:$LD_LIBRARY_PATH" - fi - deps_hint="Please make sure openssl-1.1.1 (libcrypto), libncurses and libatomic1 are installed." - if ! check_erlang_start; then - ## it's hopeless - echoerr "FATAL: Unable to start Erlang." - echoerr "$deps_hint" - echoerr "Also ensure it's running on the correct platform:" - echoerr "$BUILD_INFO" - exit 1 - fi - echoerr "Using libs from '${DYNLIBS_DIR}' due to missing from the OS." 
- echoerr "$deps_hint" -fi +## IS_BOOT_COMMAND is set for later to inspect node name and cookie from hocon config (or env variable) +case "${COMMAND}" in + start|console|console_clean|foreground) + IS_BOOT_COMMAND='yes' + ;; + ertspath) + echo "$ERTS_DIR" + exit 0 + ;; + root_dir) + echo "$RUNNER_ROOT_DIR" + exit 0 + ;; + *) + IS_BOOT_COMMAND='no' + ;; +esac ## backward compatible if [ -d "$ERTS_DIR/lib" ]; then @@ -272,6 +257,68 @@ if [ "$ES" -ne 0 ]; then exit $ES fi +COMPATIBILITY_CHECK=' + io:format("BEAM_OK~n", []), + try + [_|_] = L = crypto:info_lib(), + io:format("CRYPTO_OK ~0p~n", [L]) + catch + _ : _ -> + %% so logger has the chance to log something + timer:sleep(100), + halt(1) + + end, + try + mnesia_hook:module_info(), + io:format("MNESIA_OK~n", []) + catch + _ : _ -> + io:format("WARNING: Mnesia app has no post-coommit hook support~n", []), + halt(2) + end, + halt(0). +' + +compatiblity_info() { + # RELEASE_LIB is used by Elixir + "$BINDIR/$PROGNAME" \ + -noshell \ + -boot_var RELEASE_LIB "$ERTS_LIB_DIR/lib" \ + -boot "$REL_DIR/start_clean" \ + -eval "$COMPATIBILITY_CHECK" +} + +# Collect Erlang/OTP runtime sanity and compatibility in one go +if [ "$IS_BOOT_COMMAND" = 'yes' ]; then + # Read BUILD_INFO early as the next commands may mess up the shell + BUILD_INFO="$(cat "${REL_DIR}/BUILD_INFO")" + COMPATIBILITY_INFO="$(compatiblity_info 2>/dev/null || true)" + if ! (echo -e "$COMPATIBILITY_INFO" | grep -q 'CRYPTO_OK'); then + ## failed to start, might be due to missing libs, try to be portable + export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-$DYNLIBS_DIR}" + if [ "$LD_LIBRARY_PATH" != "$DYNLIBS_DIR" ]; then + export LD_LIBRARY_PATH="$DYNLIBS_DIR:$LD_LIBRARY_PATH" + fi + COMPATIBILITY_INFO="$(compatiblity_info 2>&1 || true)" + if ! 
(echo -e "$COMPATIBILITY_INFO" | grep -q 'BEAM_OK'); then + ## not able to start beam.smp + echoerr "$COMPATIBILITY_INFO" + echoerr "Please ensure it is running on the correct platform:" + echoerr "$BUILD_INFO" + echoerr "Version=$REL_VSN" + echoerr "Required dependencies: openssl-1.1.1 (libcrypto), libncurses and libatomic1" + exit 1 + elif ! (echo -e "$COMPATIBILITY_INFO" | grep -q 'CRYPTO_OK'); then + ## not able to start crypto app + echoerr "$COMPATIBILITY_INFO" + exit 2 + fi + echoerr "Using libs from '${DYNLIBS_DIR}' due to missing from the OS." + fi + export COMPATIBILITY_INFO +fi + NO_EPMD="-start_epmd false -epmd_module ekka_epmd -proto_dist ekka" EPMD_ARGS="${EPMD_ARGS:-${NO_EPMD}}" @@ -356,22 +403,66 @@ call_hocon() { || die "call_hocon_failed: $*" $? } -get_config_value() { +## Resolve boot configs in a batch +## This is because starting the Erlang beam with all modules loaded +## and parsing HOCON config + environment variables is a non-trivial task +CONF_KEYS=( 'node.data_dir' 'node.name' 'node.cookie' 'db.backend' ) +if [ "$IS_ENTERPRISE" = 'yes' ]; then + CONF_KEYS+=( 'license.file' 'license.key' ) +fi + +if [ "$IS_BOOT_COMMAND" = 'yes' ]; then + if [ "${EMQX_BOOT_CONFIGS:-}" = '' ]; then + EMQX_BOOT_CONFIGS="$(call_hocon -s "$SCHEMA_MOD" -c "$EMQX_ETC_DIR"/emqx.conf multi_get "${CONF_KEYS[@]}")" + ## export here so the 'console' command recursively called from + ## 'start' command does not have to parse the configs again + export EMQX_BOOT_CONFIGS + fi +else + # For non-boot commands, we try to get data_dir from ps -ef command + # shellcheck disable=SC2009 + PS_LINE="$(ps -ef | grep "\-[r]oot $RUNNER_ROOT_DIR" | grep -oE "\-emqx_data_dir.*"|| true)" + if [ "$(echo -e "$PS_LINE" | wc -l)" -eq 1 ]; then + ## only one emqx node is running + ## strip 'emqx_data_dir ' and ' --' because the dir in between may contain spaces + DATA_DIR="$(echo -e "$PS_LINE" | sed -e 's#.\+emqx_data_dir\s##g' | sed -e 's#\s--$##g')" + if [ "$DATA_DIR" = '' ]; then + ## 
this should not happen unless -emqx_data_dir is not set + die "node_is_not_running!" 1 + fi + EMQX_BOOT_CONFIGS="node.data_dir=$DATA_DIR" + else + ## None or more than one node is running, resolve from boot config + EMQX_BOOT_CONFIGS="$(call_hocon -s "$SCHEMA_MOD" -c "$EMQX_ETC_DIR"/emqx.conf multi_get "${CONF_KEYS[@]}")" + fi +fi + +get_boot_config() { path_to_value="$1" - call_hocon -s "$SCHEMA_MOD" -c "$EMQX_ETC_DIR"/emqx.conf get "$path_to_value" | tr -d \" + echo -e "$EMQX_BOOT_CONFIGS" | grep "$path_to_value=" | sed -e "s/$path_to_value=//g" | tr -d \" } +DATA_DIR="$(get_boot_config 'node.data_dir')" +# ensure no trailing / +DATA_DIR="${DATA_DIR%/}" +if [[ $DATA_DIR != /* ]]; then + # relative path + DATA_DIR="${RUNNER_ROOT_DIR}/${DATA_DIR}" +fi +CONFIGS_DIR="$DATA_DIR/configs" +mkdir -p "$CONFIGS_DIR" + check_license() { if [ "$IS_ENTERPRISE" == "no" ]; then return 0 fi - file_license="${EMQX_LICENSE__FILE:-$(get_config_value license.file)}" + file_license="${EMQX_LICENSE__FILE:-$(get_boot_config 'license.file')}" if [[ -n "$file_license" && ("$file_license" != "undefined") ]]; then call_nodetool check_license_file "$file_license" else - key_license="${EMQX_LICENSE__KEY:-$(get_config_value license.key)}" + key_license="${EMQX_LICENSE__KEY:-$(get_boot_config 'license.key')}" if [[ -n "$key_license" && ("$key_license" != "undefined") ]]; then call_nodetool check_license_key "$key_license" @@ -396,15 +487,6 @@ relx_start_command() { "$START_OPTION" } -DATA_DIR="$(get_config_value 'node.data_dir')" -DATA_DIR="${DATA_DIR%/}" -if [[ $DATA_DIR != /* ]]; then - # relative - DATA_DIR="${RUNNER_ROOT_DIR}/${DATA_DIR}" -fi -CONFIGS_DIR="$DATA_DIR/configs" -mkdir -p "$CONFIGS_DIR" - # Function to generate app.config and vm.args # sets two environment variables CONF_FILE and ARGS_FILE generate_config() { @@ -416,7 +498,7 @@ generate_config() { ## timestamp for each generation local NOW_TIME - NOW_TIME="$(call_hocon now_time)" + NOW_TIME="$(date +'%Y.%m.%d.%H.%M.%S')" ## 
this command populates two files: app.