From 39d6f612cad142d8d8ffb008b0f8c5150668c069 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Wed, 31 May 2023 00:13:15 +0200 Subject: [PATCH 01/29] feat(emqx_release): add new APIs to parse/compare release versions --- apps/emqx/include/emqx_release.hrl | 4 +- apps/emqx/src/emqx_release.erl | 53 ++++++++++++++++++++++++- apps/emqx/test/emqx_release_tests.erl | 56 +++++++++++++++++++++++++++ 3 files changed, 110 insertions(+), 3 deletions(-) create mode 100644 apps/emqx/test/emqx_release_tests.erl diff --git a/apps/emqx/include/emqx_release.hrl b/apps/emqx/include/emqx_release.hrl index 25b340f50..ecb0baabc 100644 --- a/apps/emqx/include/emqx_release.hrl +++ b/apps/emqx/include/emqx_release.hrl @@ -31,11 +31,11 @@ %% NOTE: ALso make sure to follow the instructions in end of %% `apps/emqx/src/bpapi/README.md' -%% Community edition +%% Opensource edition -define(EMQX_RELEASE_CE, "5.1.0-alpha.1"). %% Enterprise edition -define(EMQX_RELEASE_EE, "5.1.0-alpha.1"). -%% the HTTP API version +%% The HTTP API version -define(EMQX_API_VERSION, "5.0"). diff --git a/apps/emqx/src/emqx_release.erl b/apps/emqx/src/emqx_release.erl index 4ecf8598b..9a930649e 100644 --- a/apps/emqx/src/emqx_release.erl +++ b/apps/emqx/src/emqx_release.erl @@ -21,7 +21,10 @@ edition_vsn_prefix/0, edition_longstr/0, description/0, - version/0 + version/0, + version_with_prefix/0, + vsn_compare/1, + vsn_compare/2 ]). -include("emqx_release.hrl"). @@ -68,6 +71,10 @@ edition_vsn_prefix() -> edition_longstr() -> maps:get(edition(), ?EMQX_REL_NAME). +%% @doc Return the release version with prefix. +version_with_prefix() -> + edition_vsn_prefix() ++ version(). + %% @doc Return the release version. version() -> case lists:keyfind(emqx_vsn, 1, ?MODULE:module_info(compile)) of @@ -92,3 +99,47 @@ version() -> build_vsn() -> maps:get(edition(), ?EMQX_REL_VSNS). + +%% @doc Compare the given version with the current running version, +%% return 'newer' 'older' or 'same'. +vsn_compare("v" ++ Vsn) -> + vsn_compare(?EMQX_RELEASE_CE, Vsn); +vsn_compare("e" ++ Vsn) -> + vsn_compare(?EMQX_RELEASE_EE, Vsn). + +%% @priv Compare the second argument with the first argument, return +%% 'newer' 'older' or 'same' semver comparison result. +vsn_compare(Vsn1, Vsn2) -> + ParsedVsn1 = parse_vsn(Vsn1), + ParsedVsn2 = parse_vsn(Vsn2), + case ParsedVsn1 =:= ParsedVsn2 of + true -> + same; + false when ParsedVsn1 < ParsedVsn2 -> + newer; + false -> + older + end. + +%% @priv Parse the version string to a tuple. +%% Return {{Major, Minor, Patch}, Suffix}. +%% Where Suffix is either an empty string or a tuple like {"rc", 1}. +%% NOTE: taking the nature ordering of the suffix: +%% {"alpha", _} < {"beta", _} < {"rc", _} < "" +parse_vsn(Vsn) -> + try + [V1, V2, V3 | Suffix0] = string:tokens(Vsn, ".-"), + Suffix = + case Suffix0 of + "" -> + %% For the case like "5.1.0" + ""; + [ReleaseStage, Number] -> + %% For the case like "5.1.0-rc.1" + {ReleaseStage, list_to_integer(Number)} + end, + {{list_to_integer(V1), list_to_integer(V2), list_to_integer(V3)}, Suffix} + catch + _:_ -> + erlang:error({invalid_version_string, Vsn}) + end. diff --git a/apps/emqx/test/emqx_release_tests.erl b/apps/emqx/test/emqx_release_tests.erl new file mode 100644 index 000000000..45d9a53c4 --- /dev/null +++ b/apps/emqx/test/emqx_release_tests.erl @@ -0,0 +1,56 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_release_tests). + +-include_lib("eunit/include/eunit.hrl"). + +vsn_compre_test_() -> + CurrentVersion = emqx_release:version_with_prefix(), + [ + {"must be 'same' when comparing with current version", fun() -> + ?assertEqual(same, emqx_release:vsn_compare(CurrentVersion)) + end}, + {"must be 'same' when comparing same version strings", fun() -> + ?assertEqual(same, emqx_release:vsn_compare("1.1.1", "1.1.1")) + end}, + {"1.1.1 is older than 1.1.2", fun() -> + ?assertEqual(older, emqx_release:vsn_compare("1.1.2", "1.1.1")), + ?assertEqual(newer, emqx_release:vsn_compare("1.1.1", "1.1.2")) + end}, + {"1.1.9 is older than 1.1.10", fun() -> + ?assertEqual(older, emqx_release:vsn_compare("1.1.10", "1.1.9")), + ?assertEqual(newer, emqx_release:vsn_compare("1.1.9", "1.1.10")) + end}, + {"alpha is older than beta", fun() -> + ?assertEqual(older, emqx_release:vsn_compare("1.1.1-beta.1", "1.1.1-alpha.2")), + ?assertEqual(newer, emqx_release:vsn_compare("1.1.1-alpha.2", "1.1.1-beta.1")) + end}, + {"beta is older than rc", fun() -> + ?assertEqual(older, emqx_release:vsn_compare("1.1.1-rc.1", "1.1.1-beta.2")), + ?assertEqual(newer, emqx_release:vsn_compare("1.1.1-beta.2", "1.1.1-rc.1")) + end}, + {"rc is older than official cut", fun() -> + ?assertEqual(older, emqx_release:vsn_compare("1.1.1", "1.1.1-rc.1")), + ?assertEqual(newer, emqx_release:vsn_compare("1.1.1-rc.1", "1.1.1")) + end}, + {"invalid version string will crash", fun() -> + ?assertError({invalid_version_string, "1.1.a"}, emqx_release:vsn_compare("v1.1.a")), + ?assertError( + {invalid_version_string, "1.1.1-alpha"}, emqx_release:vsn_compare("e1.1.1-alpha") + ) + end} + ]. From 332daabcc53a7f1f171cc9ae47fcf72328a9523f Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Wed, 31 May 2023 14:44:56 +0200 Subject: [PATCH 02/29] fix(config): Do not sync cluster config from nodes running new version --- apps/emqx_conf/src/emqx_conf_app.erl | 192 ++++++++++++-------- apps/emqx_conf/test/emqx_conf_app_SUITE.erl | 39 ++++ 2 files changed, 159 insertions(+), 72 deletions(-) diff --git a/apps/emqx_conf/src/emqx_conf_app.erl b/apps/emqx_conf/src/emqx_conf_app.erl index 70234b525..363e367b6 100644 --- a/apps/emqx_conf/src/emqx_conf_app.erl +++ b/apps/emqx_conf/src/emqx_conf_app.erl @@ -42,6 +42,8 @@ start(_StartType, _StartArgs) -> stop(_State) -> ok. +%% Read the cluster config from the local node. +%% This function is named 'override' due to historical reasons. get_override_config_file() -> Node = node(), case emqx_app:get_init_config_load_done() of @@ -63,7 +65,7 @@ get_override_config_file() -> tnx_id => TnxId, node => Node, has_deprecated_file => HasDeprecateFile, - release => emqx_app:get_release() + release => emqx_release:version_with_prefix() } end, case mria:ro_transaction(?CLUSTER_RPC_SHARD, Fun) of @@ -95,7 +97,7 @@ init_conf() -> %% Workaround for https://github.com/emqx/mria/issues/94: _ = mria_rlog:wait_for_shards([?CLUSTER_RPC_SHARD], 1000), _ = mria:wait_for_tables([?CLUSTER_MFA, ?CLUSTER_COMMIT]), - {ok, TnxId} = copy_override_conf_from_core_node(), + {ok, TnxId} = sync_cluster_conf(), _ = emqx_app:set_init_tnx_id(TnxId), ok = init_load(), ok = emqx_app:set_init_config_load_done(). @@ -103,88 +105,134 @@ init_conf() -> cluster_nodes() -> mria:cluster_nodes(cores) -- [node()]. -copy_override_conf_from_core_node() -> +%% @doc Try to sync the cluster config from other core nodes. +sync_cluster_conf() -> case cluster_nodes() of - %% The first core nodes is self. [] -> - ?SLOG(debug, #{msg => "skip_copy_override_conf_from_core_node"}), + %% The first core nodes is self. + ?SLOG(debug, #{ + msg => "skip_sync_cluster_conf", reason => "Running single node" + }), {ok, ?DEFAULT_INIT_TXN_ID}; Nodes -> - {Results, Failed} = emqx_conf_proto_v2:get_override_config_file(Nodes), - {Ready, NotReady0} = lists:partition(fun(Res) -> element(1, Res) =:= ok end, Results), - NotReady = lists:filter(fun(Res) -> element(1, Res) =:= error end, NotReady0), - case (Failed =/= [] orelse NotReady =/= []) andalso Ready =/= [] of + sync_cluster_conf2(Nodes) + end. + +%% @priv Some core nodes are running, try to sync the cluster config from them. +sync_cluster_conf2(Nodes) -> + {Results, Failed} = emqx_conf_proto_v2:get_override_config_file(Nodes), + {Ready, NotReady0} = lists:partition(fun(Res) -> element(1, Res) =:= ok end, Results), + NotReady = lists:filter(fun(Res) -> element(1, Res) =:= error end, NotReady0), + case (Failed =/= [] orelse NotReady =/= []) of + true when Ready =/= [] -> + %% Some core nodes failed to reply. + Warning = #{ + nodes => Nodes, + failed => Failed, + not_ready => NotReady, + msg => "ignored_nodes_when_sync_cluster_conf" + }, + ?SLOG(warning, Warning); + true -> + %% There are core nodes running but no one was able to reply. + ?SLOG(error, #{ + msg => "failed_to_sync_cluster_conf", + nodes => Nodes, + failed => Failed, + not_ready => NotReady + }); + false -> + ok + end, + case Ready of + [] -> + case should_proceed_with_boot() of true -> - Warning = #{ - nodes => Nodes, - failed => Failed, - not_ready => NotReady, - msg => "ignored_bad_nodes_when_copy_init_config" - }, - ?SLOG(warning, Warning); - false -> - ok - end, - case Ready of - [] -> - %% Other core nodes running but no one replicated it successfully. - ?SLOG(error, #{ - msg => "copy_override_conf_from_core_node_failed", + %% Act as if this node is alone, so it can + %% finish the boot sequence and load the + %% config for other nodes to copy it. + ?SLOG(info, #{ + msg => "skip_sync_cluster_conf", + loading_from_disk => true, nodes => Nodes, failed => Failed, not_ready => NotReady }), - - case should_proceed_with_boot() of - true -> - %% Act as if this node is alone, so it can - %% finish the boot sequence and load the - %% config for other nodes to copy it. - ?SLOG(info, #{ - msg => "skip_copy_override_conf_from_core_node", - loading_from_disk => true, - nodes => Nodes, - failed => Failed, - not_ready => NotReady - }), - {ok, ?DEFAULT_INIT_TXN_ID}; - false -> - %% retry in some time - Jitter = rand:uniform(2000), - Timeout = 10000 + Jitter, - ?SLOG(info, #{ - msg => "copy_cluster_conf_from_core_node_retry", - timeout => Timeout, - nodes => Nodes, - failed => Failed, - not_ready => NotReady - }), - timer:sleep(Timeout), - copy_override_conf_from_core_node() - end; - _ -> - [{ok, Info} | _] = lists:sort(fun conf_sort/2, Ready), - #{node := Node, conf := RawOverrideConf, tnx_id := TnxId} = Info, - HasDeprecatedFile = has_deprecated_file(Info), - ?SLOG(debug, #{ - msg => "copy_cluster_conf_from_core_node_success", - node => Node, - has_deprecated_file => HasDeprecatedFile, - local_release => emqx_app:get_release(), - remote_release => maps:get(release, Info, "before_v5.0.24|e5.0.3"), - data_dir => emqx:data_dir(), - tnx_id => TnxId + {ok, ?DEFAULT_INIT_TXN_ID}; + false -> + %% retry in some time + Jitter = rand:uniform(2000), + Timeout = 10000 + Jitter, + timer:sleep(Timeout), + ?SLOG(warning, #{ + msg => "sync_cluster_conf_retry", + timeout => Timeout, + nodes => Nodes, + failed => Failed, + not_ready => NotReady }), - ok = emqx_config:save_to_override_conf( - HasDeprecatedFile, - RawOverrideConf, - #{override_to => cluster} - ), - ok = sync_data_from_node(Node), - {ok, TnxId} - end + sync_cluster_conf() + end; + _ -> + sync_cluster_conf3(Ready) end. +%% @priv Filter out the nodes which are running a newer version than this node. +sync_cluster_conf3(Ready) -> + NotNewer = fun({ok, #{release := RemoteRelease}}) -> + try + emqx_release:vsn_compare(RemoteRelease) =/= newer + catch + _:_ -> + %% If the version is not valid (without v or e prefix), + %% we know it's older than v5.1.0/e5.1.0 + true + end + end, + case lists:filter(NotNewer, Ready) of + [] -> + %% All available core nodes are running a newer version than this node. + %% Start this node without syncing cluster config from them. + %% This is likely a restart of an older version node during cluster upgrade. + NodesAndVersions = lists:map( + fun({ok, #{node := Node, release := Release}}) -> + #{node => Node, version => Release} + end, + Ready + ), + ?SLOG(warning, #{ + msg => "all_available_nodes_running_newer_version", + hint => "Booting this node without syncing cluster config from peer core nodes", + peer_nodes => NodesAndVersions + }), + {ok, ?DEFAULT_INIT_TXN_ID}; + Ready2 -> + sync_cluster_conf4(Ready2) + end. + +%% @priv Some core nodes are running and replied with their configs successfully. +%% Try to sort the results and save the first one for local use. +sync_cluster_conf4(Ready) -> + [{ok, Info} | _] = lists:sort(fun conf_sort/2, Ready), + #{node := Node, conf := RawOverrideConf, tnx_id := TnxId} = Info, + HasDeprecatedFile = has_deprecated_file(Info), + ?SLOG(debug, #{ + msg => "sync_cluster_conf_success", + synced_from_node => Node, + has_deprecated_file => HasDeprecatedFile, + local_release => emqx_app:get_release(), + remote_release => maps:get(release, Info, "before_v5.0.24|e5.0.3"), + data_dir => emqx:data_dir(), + tnx_id => TnxId + }), + ok = emqx_config:save_to_override_conf( + HasDeprecatedFile, + RawOverrideConf, + #{override_to => cluster} + ), + ok = sync_data_from_node(Node), + {ok, TnxId}. + should_proceed_with_boot() -> TablesStatus = emqx_cluster_rpc:get_tables_status(), LocalNode = node(), diff --git a/apps/emqx_conf/test/emqx_conf_app_SUITE.erl b/apps/emqx_conf/test/emqx_conf_app_SUITE.erl index 583405158..95b4ce697 100644 --- a/apps/emqx_conf/test/emqx_conf_app_SUITE.erl +++ b/apps/emqx_conf/test/emqx_conf_app_SUITE.erl @@ -98,6 +98,34 @@ t_copy_deprecated_data_dir(_Config) -> stop_cluster(Nodes) end. +t_no_copy_from_newer_version_node(_Config) -> + net_kernel:start(['master2@127.0.0.1', longnames]), + ct:timetrap({seconds, 120}), + snabbkaffe:fix_ct_logging(), + Cluster = cluster([cluster_spec({core, 10}), cluster_spec({core, 11}), cluster_spec({core, 12})]), + OKs = [ok, ok, ok], + [First | Rest] = Nodes = start_cluster(Cluster), + try + File = "/configs/cluster.hocon", + assert_config_load_done(Nodes), + rpc:call(First, ?MODULE, create_data_dir, [File]), + {OKs, []} = rpc:multicall(Nodes, application, stop, [emqx_conf]), + {OKs, []} = rpc:multicall(Nodes, ?MODULE, set_data_dir_env, []), + {OKs, []} = rpc:multicall(Nodes, meck, new, [ + emqx_release, [passthrough, no_history, no_link, non_strict] + ]), + %% 99.9.9 is always newer than the current version + {OKs, []} = rpc:multicall(Nodes, meck, expect, [ + emqx_release, version_with_prefix, 0, "e99.9.9" + ]), + ok = rpc:call(First, application, start, [emqx_conf]), + {[ok, ok], []} = rpc:multicall(Rest, application, start, [emqx_conf]), + ok = assert_no_cluster_conf_copied(Rest, File), + stop_cluster(Nodes), + ok + after + stop_cluster(Nodes) + end. %%------------------------------------------------------------------------------ %% Helper functions %%------------------------------------------------------------------------------ @@ -158,6 +186,17 @@ assert_data_copy_done([First0 | Rest], File) -> Rest ). +assert_no_cluster_conf_copied([], _) -> + ok; +assert_no_cluster_conf_copied([Node | Nodes], File) -> + NodeStr = atom_to_list(Node), + ?assertEqual( + {error, enoent}, + file:read_file(NodeStr ++ File), + #{node => Node} + ), + assert_no_cluster_conf_copied(Nodes, File). + assert_config_load_done(Nodes) -> lists:foreach( fun(Node) -> From 5146de5b1cad540e1cd2f0f75a5a3cdf84ca0308 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Wed, 31 May 2023 17:34:39 +0200 Subject: [PATCH 03/29] feat: add a backup copies for cluster.hocon --- apps/emqx/src/emqx_config.erl | 94 +++++++++++++++++++++------- apps/emqx/test/emqx_config_SUITE.erl | 66 ++++++++++++++++++- 2 files changed, 136 insertions(+), 24 deletions(-) diff --git a/apps/emqx/src/emqx_config.erl b/apps/emqx/src/emqx_config.erl index 91809134c..c84a1fb9c 100644 --- a/apps/emqx/src/emqx_config.erl +++ b/apps/emqx/src/emqx_config.erl @@ -91,7 +91,7 @@ -export([ensure_atom_conf_path/2]). -ifdef(TEST). --export([erase_all/0]). +-export([erase_all/0, backup_and_write/2]). -endif. -include("logger.hrl"). @@ -105,6 +105,7 @@ -define(LISTENER_CONF_PATH(TYPE, LISTENER, PATH), [listeners, TYPE, LISTENER | PATH]). -define(CONFIG_NOT_FOUND_MAGIC, '$0tFound'). +-define(MAX_KEEP_BACKUP_CONFIGS, 10). -export_type([ update_request/0, @@ -601,43 +602,94 @@ save_to_config_map(Conf, RawConf) -> -spec save_to_override_conf(boolean(), raw_config(), update_opts()) -> ok | {error, term()}. save_to_override_conf(_, undefined, _) -> ok; -%% TODO: Remove deprecated override conf file when 5.1 save_to_override_conf(true, RawConf, Opts) -> case deprecated_conf_file(Opts) of undefined -> ok; FileName -> - ok = filelib:ensure_dir(FileName), - case file:write_file(FileName, hocon_pp:do(RawConf, #{})) of - ok -> - ok; - {error, Reason} -> - ?SLOG(error, #{ - msg => "failed_to_write_override_file", - filename => FileName, - reason => Reason - }), - {error, Reason} - end + backup_and_write(FileName, hocon_pp:do(RawConf, #{})) end; save_to_override_conf(false, RawConf, _Opts) -> case cluster_hocon_file() of undefined -> ok; FileName -> - ok = filelib:ensure_dir(FileName), - case file:write_file(FileName, hocon_pp:do(RawConf, #{})) of + backup_and_write(FileName, hocon_pp:do(RawConf, #{})) + end. + +%% @priv This is the same human-readable timestamp format as +%% hocon-cli generated app.