From 28021c466a3756ab938bc37a127323bdc197b746 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Thu, 30 Mar 2023 12:31:34 +0200 Subject: [PATCH 1/5] chore(bin/emqx): do not use -r option in rm command when deleting .siz --- bin/emqx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/emqx b/bin/emqx index c0601ec85..0bfe94d4c 100755 --- a/bin/emqx +++ b/bin/emqx @@ -766,7 +766,7 @@ generate_config() { local node_name="$2" ## Delete the *.siz files first or it can't start after ## changing the config 'log.rotation.size' - rm -rf "${RUNNER_LOG_DIR}"/*.siz + rm -f "${RUNNER_LOG_DIR}"/*.siz ## timestamp for each generation local NOW_TIME From 494e4b639a82168f7b6bc4e9bffe77f0c78d50ac Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Thu, 30 Mar 2023 12:52:51 +0200 Subject: [PATCH 2/5] chore: exit with non-zero status if config initialization failed --- apps/emqx/src/emqx_app.erl | 4 ++++ apps/emqx_conf/src/emqx_cluster_rpc.erl | 6 ++++++ apps/emqx_conf/src/emqx_conf_app.erl | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/apps/emqx/src/emqx_app.erl b/apps/emqx/src/emqx_app.erl index 6188d8030..77ece1c60 100644 --- a/apps/emqx/src/emqx_app.erl +++ b/apps/emqx/src/emqx_app.erl @@ -72,9 +72,13 @@ set_init_config_load_done() -> get_init_config_load_done() -> application:get_env(emqx, init_config_load_done, false). +%% @doc Set the transaction id from which this node should start applying after boot. +%% The transaction ID is received from the core node which we just copied the latest +%% config from. set_init_tnx_id(TnxId) -> application:set_env(emqx, cluster_rpc_init_tnx_id, TnxId). +%% @doc Get the transaction id from which this node should start applying after boot. get_init_tnx_id() -> application:get_env(emqx, cluster_rpc_init_tnx_id, -1). diff --git a/apps/emqx_conf/src/emqx_cluster_rpc.erl b/apps/emqx_conf/src/emqx_cluster_rpc.erl index 89f678554..f7c34031c 100644 --- a/apps/emqx_conf/src/emqx_cluster_rpc.erl +++ b/apps/emqx_conf/src/emqx_cluster_rpc.erl @@ -275,8 +275,13 @@ init([Node, RetryMs]) -> _ = mria:wait_for_tables([?CLUSTER_MFA, ?CLUSTER_COMMIT]), {ok, _} = mnesia:subscribe({table, ?CLUSTER_MFA, simple}), State = #{node => Node, retry_interval => RetryMs}, + %% The init transaction ID is set in emqx_conf_app after + %% it has fetched the latest config from one of the core nodes TnxId = emqx_app:get_init_tnx_id(), ok = maybe_init_tnx_id(Node, TnxId), + %% Now continue with the normal catch-up process + %% That is: apply the missing transactions after the config + %% was copied until now. {ok, State, {continue, ?CATCH_UP}}. %% @private @@ -396,6 +401,7 @@ get_cluster_tnx_id() -> Id -> Id end. +%% The entry point of a config change transaction. init_mfa(Node, MFA) -> mnesia:write_lock_table(?CLUSTER_MFA), LatestId = get_cluster_tnx_id(), diff --git a/apps/emqx_conf/src/emqx_conf_app.erl b/apps/emqx_conf/src/emqx_conf_app.erl index 09478e304..0896eb718 100644 --- a/apps/emqx_conf/src/emqx_conf_app.erl +++ b/apps/emqx_conf/src/emqx_conf_app.erl @@ -38,7 +38,7 @@ start(_StartType, _StartArgs) -> reason => E, stacktrace => St }), - init:stop() + init:stop(1) end, ok = emqx_config_logger:refresh_config(), emqx_conf_sup:start_link(). From 75817e23bd68babbba8b3d2ecbb10e23530cae11 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Thu, 30 Mar 2023 14:03:23 +0200 Subject: [PATCH 3/5] test: add some tests for EMQX boot failures --- scripts/test/emqx-boot.bats | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 scripts/test/emqx-boot.bats diff --git a/scripts/test/emqx-boot.bats b/scripts/test/emqx-boot.bats new file mode 100644 index 000000000..96f9a4457 --- /dev/null +++ b/scripts/test/emqx-boot.bats @@ -0,0 +1,21 @@ +#!/usr/bin/env bats + +# https://github.com/bats-core/bats-core +# env PROFILE=emqx bats -t -p --verbose-run scripts/test/emqx-boot.bats + +@test "PROFILE must be set" { + [[ -n "$PROFILE" ]] +} + +@test "emqx boot with invalid node name" { + output="$(env EMQX_NODE_NAME="invliadename#" ./_build/$PROFILE/rel/emqx/bin/emqx console 2>&1|| true)" + [[ "$output" =~ "ERROR: Invalid node name,".+ ]] +} + +@test "corrupted cluster config file" { + conffile="./_build/$PROFILE/rel/emqx/data/configs/cluster-override.conf" + echo "{" > $conffile + run ./_build/$PROFILE/rel/emqx/bin/emqx console + [[ $status -ne 0 ]] + rm -f $conffile +} From da8794ede03762aaf70f71f0373393324037d70e Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Thu, 30 Mar 2023 14:17:08 +0200 Subject: [PATCH 4/5] docs: Add change logs --- changes/ce/fix-10286.en.md | 2 ++ changes/ce/fix-10286.zh.md | 2 ++ 2 files changed, 4 insertions(+) create mode 100644 changes/ce/fix-10286.en.md create mode 100644 changes/ce/fix-10286.zh.md diff --git a/changes/ce/fix-10286.en.md b/changes/ce/fix-10286.en.md new file mode 100644 index 000000000..1bf721e55 --- /dev/null +++ b/changes/ce/fix-10286.en.md @@ -0,0 +1,2 @@ +Enhance logging behaviour during boot failure. +When EMQX fails to start due to a corrupted configuration files, excessive logging is eliminated and no crash dump file is generated. diff --git a/changes/ce/fix-10286.zh.md b/changes/ce/fix-10286.zh.md new file mode 100644 index 000000000..83455b8fd --- /dev/null +++ b/changes/ce/fix-10286.zh.md @@ -0,0 +1,2 @@ +优化启动失败的错误日志。 +如果 EMQX 因为损坏的配置文件无法启动时,不会再打印过多的错误日志,也不再生成 crash.dump 文件。 From 14039e393a7f2d41400bdfce638b53880d6d68d7 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Thu, 30 Mar 2023 17:11:10 +0200 Subject: [PATCH 5/5] docs: Update changes/ce/fix-10286.en.md Co-authored-by: Thales Macedo Garitezi --- changes/ce/fix-10286.en.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changes/ce/fix-10286.en.md b/changes/ce/fix-10286.en.md index 1bf721e55..3a51fefe2 100644 --- a/changes/ce/fix-10286.en.md +++ b/changes/ce/fix-10286.en.md @@ -1,2 +1,2 @@ Enhance logging behaviour during boot failure. -When EMQX fails to start due to a corrupted configuration files, excessive logging is eliminated and no crash dump file is generated. +When EMQX fails to start due to corrupted configuration files, excessive logging is eliminated and no crash dump file is generated.