fix(config): Do not sync cluster config from nodes running new version
This commit is contained in:
parent
39d6f612ca
commit
332daabcc5
|
@ -42,6 +42,8 @@ start(_StartType, _StartArgs) ->
|
||||||
stop(_State) ->
|
stop(_State) ->
|
||||||
ok.
|
ok.
|
||||||
|
|
||||||
|
%% Read the cluster config from the local node.
|
||||||
|
%% This function is named 'override' due to historical reasons.
|
||||||
get_override_config_file() ->
|
get_override_config_file() ->
|
||||||
Node = node(),
|
Node = node(),
|
||||||
case emqx_app:get_init_config_load_done() of
|
case emqx_app:get_init_config_load_done() of
|
||||||
|
@ -63,7 +65,7 @@ get_override_config_file() ->
|
||||||
tnx_id => TnxId,
|
tnx_id => TnxId,
|
||||||
node => Node,
|
node => Node,
|
||||||
has_deprecated_file => HasDeprecateFile,
|
has_deprecated_file => HasDeprecateFile,
|
||||||
release => emqx_app:get_release()
|
release => emqx_release:version_with_prefix()
|
||||||
}
|
}
|
||||||
end,
|
end,
|
||||||
case mria:ro_transaction(?CLUSTER_RPC_SHARD, Fun) of
|
case mria:ro_transaction(?CLUSTER_RPC_SHARD, Fun) of
|
||||||
|
@ -95,7 +97,7 @@ init_conf() ->
|
||||||
%% Workaround for https://github.com/emqx/mria/issues/94:
|
%% Workaround for https://github.com/emqx/mria/issues/94:
|
||||||
_ = mria_rlog:wait_for_shards([?CLUSTER_RPC_SHARD], 1000),
|
_ = mria_rlog:wait_for_shards([?CLUSTER_RPC_SHARD], 1000),
|
||||||
_ = mria:wait_for_tables([?CLUSTER_MFA, ?CLUSTER_COMMIT]),
|
_ = mria:wait_for_tables([?CLUSTER_MFA, ?CLUSTER_COMMIT]),
|
||||||
{ok, TnxId} = copy_override_conf_from_core_node(),
|
{ok, TnxId} = sync_cluster_conf(),
|
||||||
_ = emqx_app:set_init_tnx_id(TnxId),
|
_ = emqx_app:set_init_tnx_id(TnxId),
|
||||||
ok = init_load(),
|
ok = init_load(),
|
||||||
ok = emqx_app:set_init_config_load_done().
|
ok = emqx_app:set_init_config_load_done().
|
||||||
|
@ -103,88 +105,134 @@ init_conf() ->
|
||||||
cluster_nodes() ->
|
cluster_nodes() ->
|
||||||
mria:cluster_nodes(cores) -- [node()].
|
mria:cluster_nodes(cores) -- [node()].
|
||||||
|
|
||||||
copy_override_conf_from_core_node() ->
|
%% @doc Try to sync the cluster config from other core nodes.
|
||||||
|
sync_cluster_conf() ->
|
||||||
case cluster_nodes() of
|
case cluster_nodes() of
|
||||||
%% The first core nodes is self.
|
|
||||||
[] ->
|
[] ->
|
||||||
?SLOG(debug, #{msg => "skip_copy_override_conf_from_core_node"}),
|
%% The first core nodes is self.
|
||||||
|
?SLOG(debug, #{
|
||||||
|
msg => "skip_sync_cluster_conf", reason => "Running single node"
|
||||||
|
}),
|
||||||
{ok, ?DEFAULT_INIT_TXN_ID};
|
{ok, ?DEFAULT_INIT_TXN_ID};
|
||||||
Nodes ->
|
Nodes ->
|
||||||
{Results, Failed} = emqx_conf_proto_v2:get_override_config_file(Nodes),
|
sync_cluster_conf2(Nodes)
|
||||||
{Ready, NotReady0} = lists:partition(fun(Res) -> element(1, Res) =:= ok end, Results),
|
end.
|
||||||
NotReady = lists:filter(fun(Res) -> element(1, Res) =:= error end, NotReady0),
|
|
||||||
case (Failed =/= [] orelse NotReady =/= []) andalso Ready =/= [] of
|
%% @priv Some core nodes are running, try to sync the cluster config from them.
|
||||||
|
sync_cluster_conf2(Nodes) ->
|
||||||
|
{Results, Failed} = emqx_conf_proto_v2:get_override_config_file(Nodes),
|
||||||
|
{Ready, NotReady0} = lists:partition(fun(Res) -> element(1, Res) =:= ok end, Results),
|
||||||
|
NotReady = lists:filter(fun(Res) -> element(1, Res) =:= error end, NotReady0),
|
||||||
|
case (Failed =/= [] orelse NotReady =/= []) of
|
||||||
|
true when Ready =/= [] ->
|
||||||
|
%% Some core nodes failed to reply.
|
||||||
|
Warning = #{
|
||||||
|
nodes => Nodes,
|
||||||
|
failed => Failed,
|
||||||
|
not_ready => NotReady,
|
||||||
|
msg => "ignored_nodes_when_sync_cluster_conf"
|
||||||
|
},
|
||||||
|
?SLOG(warning, Warning);
|
||||||
|
true ->
|
||||||
|
%% There are core nodes running but no one was able to reply.
|
||||||
|
?SLOG(error, #{
|
||||||
|
msg => "failed_to_sync_cluster_conf",
|
||||||
|
nodes => Nodes,
|
||||||
|
failed => Failed,
|
||||||
|
not_ready => NotReady
|
||||||
|
});
|
||||||
|
false ->
|
||||||
|
ok
|
||||||
|
end,
|
||||||
|
case Ready of
|
||||||
|
[] ->
|
||||||
|
case should_proceed_with_boot() of
|
||||||
true ->
|
true ->
|
||||||
Warning = #{
|
%% Act as if this node is alone, so it can
|
||||||
nodes => Nodes,
|
%% finish the boot sequence and load the
|
||||||
failed => Failed,
|
%% config for other nodes to copy it.
|
||||||
not_ready => NotReady,
|
?SLOG(info, #{
|
||||||
msg => "ignored_bad_nodes_when_copy_init_config"
|
msg => "skip_sync_cluster_conf",
|
||||||
},
|
loading_from_disk => true,
|
||||||
?SLOG(warning, Warning);
|
|
||||||
false ->
|
|
||||||
ok
|
|
||||||
end,
|
|
||||||
case Ready of
|
|
||||||
[] ->
|
|
||||||
%% Other core nodes running but no one replicated it successfully.
|
|
||||||
?SLOG(error, #{
|
|
||||||
msg => "copy_override_conf_from_core_node_failed",
|
|
||||||
nodes => Nodes,
|
nodes => Nodes,
|
||||||
failed => Failed,
|
failed => Failed,
|
||||||
not_ready => NotReady
|
not_ready => NotReady
|
||||||
}),
|
}),
|
||||||
|
{ok, ?DEFAULT_INIT_TXN_ID};
|
||||||
case should_proceed_with_boot() of
|
false ->
|
||||||
true ->
|
%% retry in some time
|
||||||
%% Act as if this node is alone, so it can
|
Jitter = rand:uniform(2000),
|
||||||
%% finish the boot sequence and load the
|
Timeout = 10000 + Jitter,
|
||||||
%% config for other nodes to copy it.
|
timer:sleep(Timeout),
|
||||||
?SLOG(info, #{
|
?SLOG(warning, #{
|
||||||
msg => "skip_copy_override_conf_from_core_node",
|
msg => "sync_cluster_conf_retry",
|
||||||
loading_from_disk => true,
|
timeout => Timeout,
|
||||||
nodes => Nodes,
|
nodes => Nodes,
|
||||||
failed => Failed,
|
failed => Failed,
|
||||||
not_ready => NotReady
|
not_ready => NotReady
|
||||||
}),
|
|
||||||
{ok, ?DEFAULT_INIT_TXN_ID};
|
|
||||||
false ->
|
|
||||||
%% retry in some time
|
|
||||||
Jitter = rand:uniform(2000),
|
|
||||||
Timeout = 10000 + Jitter,
|
|
||||||
?SLOG(info, #{
|
|
||||||
msg => "copy_cluster_conf_from_core_node_retry",
|
|
||||||
timeout => Timeout,
|
|
||||||
nodes => Nodes,
|
|
||||||
failed => Failed,
|
|
||||||
not_ready => NotReady
|
|
||||||
}),
|
|
||||||
timer:sleep(Timeout),
|
|
||||||
copy_override_conf_from_core_node()
|
|
||||||
end;
|
|
||||||
_ ->
|
|
||||||
[{ok, Info} | _] = lists:sort(fun conf_sort/2, Ready),
|
|
||||||
#{node := Node, conf := RawOverrideConf, tnx_id := TnxId} = Info,
|
|
||||||
HasDeprecatedFile = has_deprecated_file(Info),
|
|
||||||
?SLOG(debug, #{
|
|
||||||
msg => "copy_cluster_conf_from_core_node_success",
|
|
||||||
node => Node,
|
|
||||||
has_deprecated_file => HasDeprecatedFile,
|
|
||||||
local_release => emqx_app:get_release(),
|
|
||||||
remote_release => maps:get(release, Info, "before_v5.0.24|e5.0.3"),
|
|
||||||
data_dir => emqx:data_dir(),
|
|
||||||
tnx_id => TnxId
|
|
||||||
}),
|
}),
|
||||||
ok = emqx_config:save_to_override_conf(
|
sync_cluster_conf()
|
||||||
HasDeprecatedFile,
|
end;
|
||||||
RawOverrideConf,
|
_ ->
|
||||||
#{override_to => cluster}
|
sync_cluster_conf3(Ready)
|
||||||
),
|
|
||||||
ok = sync_data_from_node(Node),
|
|
||||||
{ok, TnxId}
|
|
||||||
end
|
|
||||||
end.
|
end.
|
||||||
|
|
||||||
|
%% @priv Filter out the nodes which are running a newer version than this node.
%%
%% `Ready' is the list of `{ok, Info}' replies collected from the peer core
%% nodes. Replies whose `release' compares as `newer' than the local release
%% are dropped, so this node does not adopt a cluster config written by a
%% newer release.
%%
%% Returns `{ok, ?DEFAULT_INIT_TXN_ID}' without syncing anything when every
%% reply came from a newer node; otherwise returns whatever
%% sync_cluster_conf4/1 returns for the remaining replies.
sync_cluster_conf3(Ready) ->
    NotNewer = fun
        ({ok, #{release := RemoteRelease}}) ->
            try
                emqx_release:vsn_compare(RemoteRelease) =/= newer
            catch
                _:_ ->
                    %% If the version is not valid (without v or e prefix),
                    %% we know it's older than v5.1.0/e5.1.0
                    true
            end;
        ({ok, _InfoWithoutRelease}) ->
            %% The `release' key is optional: sync_cluster_conf4/1 reads it
            %% with a "before_v5.0.24|e5.0.3" default. A reply without it is
            %% therefore kept as "not newer" instead of crashing the boot
            %% sequence with a function_clause error.
            true
    end,
    case lists:filter(NotNewer, Ready) of
        [] ->
            %% All available core nodes are running a newer version than this node.
            %% Start this node without syncing cluster config from them.
            %% This is likely a restart of an older version node during cluster upgrade.
            %%
            %% Every reply was rejected by NotNewer, which only happens for
            %% replies carrying a `release' key, so matching on it here is safe.
            NodesAndVersions = lists:map(
                fun({ok, #{node := Node, release := Release}}) ->
                    #{node => Node, version => Release}
                end,
                Ready
            ),
            ?SLOG(warning, #{
                msg => "all_available_nodes_running_newer_version",
                hint => "Booting this node without syncing cluster config from peer core nodes",
                peer_nodes => NodesAndVersions
            }),
            {ok, ?DEFAULT_INIT_TXN_ID};
        Ready2 ->
            sync_cluster_conf4(Ready2)
    end.
|
||||||
|
|
||||||
|
%% @priv Adopt the cluster config from one of the core nodes that replied.
%%
%% The replies in `Ready' are ordered with conf_sort/2 and the first one is
%% used: its config is saved locally as the cluster override config, then
%% sync_data_from_node/1 is called for that node.
%% Returns `{ok, TnxId}' with the transaction id reported by the chosen node.
sync_cluster_conf4(Ready) ->
    [{ok, Chosen} | _] = lists:sort(fun conf_sort/2, Ready),
    #{tnx_id := TnxId, node := SourceNode, conf := ClusterConf} = Chosen,
    Deprecated = has_deprecated_file(Chosen),
    ?SLOG(debug, #{
        msg => "sync_cluster_conf_success",
        synced_from_node => SourceNode,
        has_deprecated_file => Deprecated,
        local_release => emqx_app:get_release(),
        %% Replies without a `release' key get this placeholder in the log.
        remote_release => maps:get(release, Chosen, "before_v5.0.24|e5.0.3"),
        data_dir => emqx:data_dir(),
        tnx_id => TnxId
    }),
    SaveOpts = #{override_to => cluster},
    ok = emqx_config:save_to_override_conf(Deprecated, ClusterConf, SaveOpts),
    ok = sync_data_from_node(SourceNode),
    {ok, TnxId}.
|
||||||
|
|
||||||
should_proceed_with_boot() ->
|
should_proceed_with_boot() ->
|
||||||
TablesStatus = emqx_cluster_rpc:get_tables_status(),
|
TablesStatus = emqx_cluster_rpc:get_tables_status(),
|
||||||
LocalNode = node(),
|
LocalNode = node(),
|
||||||
|
|
|
@ -98,6 +98,34 @@ t_copy_deprecated_data_dir(_Config) ->
|
||||||
stop_cluster(Nodes)
|
stop_cluster(Nodes)
|
||||||
end.
|
end.
|
||||||
|
|
||||||
|
%% Verifies that a restarting node does not copy the cluster config from peers
%% that report a newer release.
%%
%% A three-core cluster is started, `emqx_conf' is stopped on all nodes, and
%% `emqx_release:version_with_prefix/0' is mocked on every node to return
%% "e99.9.9". After `emqx_conf' is started again (first node first), none of
%% the remaining nodes may have a cluster.hocon file in their data dir.
t_no_copy_from_newer_version_node(_Config) ->
    net_kernel:start(['master2@127.0.0.1', longnames]),
    ct:timetrap({seconds, 120}),
    snabbkaffe:fix_ct_logging(),
    Cluster = cluster([cluster_spec({core, 10}), cluster_spec({core, 11}), cluster_spec({core, 12})]),
    OKs = [ok, ok, ok],
    [First | Rest] = Nodes = start_cluster(Cluster),
    try
        File = "/configs/cluster.hocon",
        assert_config_load_done(Nodes),
        rpc:call(First, ?MODULE, create_data_dir, [File]),
        {OKs, []} = rpc:multicall(Nodes, application, stop, [emqx_conf]),
        {OKs, []} = rpc:multicall(Nodes, ?MODULE, set_data_dir_env, []),
        {OKs, []} = rpc:multicall(Nodes, meck, new, [
            emqx_release, [passthrough, no_history, no_link, non_strict]
        ]),
        %% 99.9.9 is always newer than the current version
        {OKs, []} = rpc:multicall(Nodes, meck, expect, [
            emqx_release, version_with_prefix, 0, "e99.9.9"
        ]),
        ok = rpc:call(First, application, start, [emqx_conf]),
        {[ok, ok], []} = rpc:multicall(Rest, application, start, [emqx_conf]),
        %% The cluster is stopped only in the `after' clause below, which runs
        %% on success and on failure alike, so it is not stopped twice.
        ok = assert_no_cluster_conf_copied(Rest, File)
    after
        stop_cluster(Nodes)
    end.
|
||||||
%%------------------------------------------------------------------------------
|
%%------------------------------------------------------------------------------
|
||||||
%% Helper functions
|
%% Helper functions
|
||||||
%%------------------------------------------------------------------------------
|
%%------------------------------------------------------------------------------
|
||||||
|
@ -158,6 +186,17 @@ assert_data_copy_done([First0 | Rest], File) ->
|
||||||
Rest
|
Rest
|
||||||
).
|
).
|
||||||
|
|
||||||
|
%% Asserts that, for every node in `Nodes', the file `<node name> ++ File'
%% does not exist (reading it must yield `{error, enoent}').
%% Returns `ok' when all nodes pass; the failing node is attached to the
%% assertion as a comment.
assert_no_cluster_conf_copied(Nodes, File) ->
    lists:foreach(
        fun(Node) ->
            Path = atom_to_list(Node) ++ File,
            ?assertEqual(
                {error, enoent},
                file:read_file(Path),
                #{node => Node}
            )
        end,
        Nodes
    ).
|
||||||
|
|
||||||
assert_config_load_done(Nodes) ->
|
assert_config_load_done(Nodes) ->
|
||||||
lists:foreach(
|
lists:foreach(
|
||||||
fun(Node) ->
|
fun(Node) ->
|
||||||
|
|
Loading…
Reference in New Issue