Merge pull request #11214 from zhongwencool/fix-cluster-rpc-call-tnx-id

fix: bad tnx-id when rejoining the cluster
This commit is contained in:
zhongwencool 2023-07-07 15:08:06 +08:00 committed by GitHub
commit ab07fd0547
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 49 additions and 38 deletions

View File

@ -25,9 +25,7 @@
get_description/0,
get_release/0,
set_config_loader/1,
get_config_loader/0,
set_init_tnx_id/1,
get_init_tnx_id/0
get_config_loader/0
]).
-include("logger.hrl").
@ -65,16 +63,6 @@ set_config_loader(Module) when is_atom(Module) ->
%% @doc Return the config loader stored under the `config_loader' key of the
%% `emqx' application environment. Falls back to `emqx' when the key is unset.
get_config_loader() ->
    case application:get_env(emqx, config_loader) of
        {ok, Loader} -> Loader;
        undefined -> emqx
    end.
%% @doc Set the transaction id from which this node should start applying after boot.
%% The transaction ID is received from the core node from which we just copied
%% the latest config.
%% The value is stored under the `cluster_rpc_init_tnx_id' key of the `emqx'
%% application environment; the result of `application:set_env/3' is returned.
set_init_tnx_id(TnxId) ->
application:set_env(emqx, cluster_rpc_init_tnx_id, TnxId).
%% @doc Read the transaction id from which this node should start applying
%% after boot. The value comes from the `cluster_rpc_init_tnx_id' key of the
%% `emqx' application environment; `-1' is returned when it was never set.
get_init_tnx_id() ->
    case application:get_env(emqx, cluster_rpc_init_tnx_id) of
        {ok, TnxId} -> TnxId;
        undefined -> -1
    end.
maybe_load_config() ->
case get_config_loader() of
emqx ->

View File

@ -29,7 +29,9 @@
status/0,
skip_failed_commit/1,
fast_forward_to_commit/2,
on_mria_stop/1
on_mria_stop/1,
wait_for_cluster_rpc/0,
maybe_init_tnx_id/2
]).
-export([
commit/2,
@ -59,16 +61,17 @@
-export_type([tnx_id/0, succeed_num/0]).
-ifdef(TEST).
-compile(export_all).
-compile(nowarn_export_all).
-endif.
-boot_mnesia({mnesia, [boot]}).
-include_lib("emqx/include/logger.hrl").
-include("emqx_conf.hrl").
-ifdef(TEST).
-compile(export_all).
-compile(nowarn_export_all).
-endif.
-define(INITIATE(MFA), {initiate, MFA}).
-define(CATCH_UP, catch_up).
-define(TIMEOUT, timer:minutes(1)).
@ -276,6 +279,20 @@ on_mria_stop(leave) ->
on_mria_stop(_) ->
ok.
%% @doc Wait for the cluster RPC shard and then for the ?CLUSTER_MFA and
%% ?CLUSTER_COMMIT tables. The outcome of each wait is logged (info on `ok',
%% error on anything else) but never propagated: this function always
%% returns `ok'.
wait_for_cluster_rpc() ->
    %% Workaround for https://github.com/emqx/mria/issues/94:
    %% NOTE(review): 1500 is presumably a timeout in milliseconds - confirm
    %% against mria_rlog:wait_for_shards/2.
    ShardLog = #{msg => "wait_for_cluster_rpc_shard"},
    case mria_rlog:wait_for_shards([?CLUSTER_RPC_SHARD], 1500) of
        ok ->
            ?SLOG(info, ShardLog#{result => ok});
        ShardFailure ->
            ?SLOG(error, ShardLog#{result => ShardFailure})
    end,
    %% The tables are only waited for after the shard wait has returned.
    TablesLog = #{msg => "wait_for_cluster_rpc_tables"},
    case mria:wait_for_tables([?CLUSTER_MFA, ?CLUSTER_COMMIT]) of
        ok ->
            ?SLOG(info, TablesLog#{result => ok});
        TablesFailure ->
            ?SLOG(error, TablesLog#{result => TablesFailure})
    end,
    ok.
%%%===================================================================
%%% gen_server callbacks
%%%===================================================================
@ -285,20 +302,19 @@ init([Node, RetryMs]) ->
register_mria_stop_cb(fun ?MODULE:on_mria_stop/1),
{ok, _} = mnesia:subscribe({table, ?CLUSTER_MFA, simple}),
State = #{node => Node, retry_interval => RetryMs, is_leaving => false},
%% The init transaction ID is set in emqx_conf_app after
%% it has fetched the latest config from one of the core nodes
TnxId = emqx_app:get_init_tnx_id(),
ok = maybe_init_tnx_id(Node, TnxId),
%% Now continue with the normal catch-up process
%% That is: apply the missing transactions after the config
%% was copied until now.
{ok, State, {continue, ?CATCH_UP}}.
{ok, State, {continue, {?CATCH_UP, init}}}.
%% @private
handle_continue(?CATCH_UP, State) ->
handle_continue({?CATCH_UP, init}, State) ->
%% emqx app must be started before
%% trying to catch up the rpc commit logs
ok = wait_for_emqx_ready(),
ok = wait_for_cluster_rpc(),
{noreply, State, catch_up(State)};
handle_continue(?CATCH_UP, State) ->
{noreply, State, catch_up(State)}.
handle_call(reset, _From, State) ->
@ -388,7 +404,8 @@ read_next_mfa(Node) ->
}),
TnxId;
[#cluster_rpc_commit{tnx_id = LastAppliedID}] ->
LastAppliedID + 1
OldestId = get_oldest_mfa_id(),
max(LastAppliedID + 1, OldestId)
end,
case mnesia:read(?CLUSTER_MFA, NextId) of
[] -> caught_up;
@ -404,8 +421,7 @@ do_fast_forward_to_commit(ToTnxId, State = #{node := Node}) ->
true ->
NodeId;
false ->
{atomic, LatestId} = transaction(fun ?MODULE:get_cluster_tnx_id/0, []),
case LatestId =< NodeId of
case latest_tnx_id() =< NodeId of
true ->
NodeId;
false ->
@ -420,6 +436,12 @@ get_cluster_tnx_id() ->
Id -> Id
end.
%% @private Return the first key of the ?CLUSTER_MFA table, or 0 when the
%% table is empty.
%% NOTE(review): presumably called inside an mnesia transaction/activity, and
%% the "oldest" reading assumes the table is an ordered_set keyed by tnx id -
%% confirm against the table definition.
get_oldest_mfa_id() ->
    FirstKey = mnesia:first(?CLUSTER_MFA),
    case FirstKey of
        '$end_of_table' -> 0;
        _ -> FirstKey
    end.
%% The entry point of a config change transaction.
init_mfa(Node, MFA) ->
mnesia:write_lock_table(?CLUSTER_MFA),

View File

@ -1,6 +1,6 @@
{application, emqx_conf, [
{description, "EMQX configuration management"},
{vsn, "0.1.23"},
{vsn, "0.1.24"},
{registered, []},
{mod, {emqx_conf_app, []}},
{applications, [kernel, stdlib, emqx_ctl]},

View File

@ -93,10 +93,12 @@ sync_data_from_node() ->
%% Internal functions
%% ------------------------------------------------------------------------------
init_load() ->
init_load(TnxId) ->
case emqx_app:get_config_loader() of
Module when Module == emqx; Module == emqx_conf ->
ok = emqx_config:init_load(emqx_conf:schema_module()),
%% Mark config loading as done only after the init tnx_id has been updated.
ok = emqx_cluster_rpc:maybe_init_tnx_id(node(), TnxId),
ok = emqx_app:set_config_loader(emqx_conf),
ok;
Module ->
@ -112,12 +114,10 @@ init_load_done() ->
emqx_app:get_config_loader() =/= emqx.
init_conf() ->
%% Workaround for https://github.com/emqx/mria/issues/94:
_ = mria_rlog:wait_for_shards([?CLUSTER_RPC_SHARD], 1000),
_ = mria:wait_for_tables([?CLUSTER_MFA, ?CLUSTER_COMMIT]),
emqx_cluster_rpc:wait_for_cluster_rpc(),
{ok, TnxId} = sync_cluster_conf(),
_ = emqx_app:set_init_tnx_id(TnxId),
ok = init_load().
ok = init_load(TnxId),
ok.
cluster_nodes() ->
mria:cluster_nodes(cores) -- [node()].

View File

@ -684,10 +684,10 @@ fields("cluster_call") ->
)},
{"max_history",
sc(
range(1, 500),
range(100, 10240),
#{
desc => ?DESC(cluster_call_max_history),
default => 100
default => 1024
}
)},
{"cleanup_interval",
@ -695,7 +695,7 @@ fields("cluster_call") ->
emqx_schema:duration(),
#{
desc => ?DESC(cluster_call_cleanup_interval),
default => <<"5m">>
default => <<"24h">>
}
)}
];

View File

@ -0,0 +1 @@
Fix a bug where node configuration may fail to synchronize correctly when joining the cluster.