Merge pull request #11214 from zhongwencool/fix-cluster-rpc-call-tnx-id

fix: bad tnx-id when rejoining the cluster
2023-07-07 15:08:06 +08:00 · 2023-07-07 15:08:06 +08:00 · ab07fd0547
parent a8d96ea36a f7513b900a
commit ab07fd0547
6 changed files with 49 additions and 38 deletions
--- a/apps/emqx/src/emqx_app.erl
+++ b/apps/emqx/src/emqx_app.erl
@ -25,9 +25,7 @@
    get_description/0,
    get_release/0,
    set_config_loader/1,
-    get_config_loader/0,
-    set_init_tnx_id/1,
-    get_init_tnx_id/0
+    get_config_loader/0
 ]).

 -include("logger.hrl").
@ -65,16 +63,6 @@ set_config_loader(Module) when is_atom(Module) ->
 get_config_loader() ->
    application:get_env(emqx, config_loader, emqx).

-%% @doc Set the transaction id from which this node should start applying after boot.
-%% The transaction ID is received from the core node which we just copied the latest
-%% config from.
-set_init_tnx_id(TnxId) ->
-    application:set_env(emqx, cluster_rpc_init_tnx_id, TnxId).
-
-%% @doc Get the transaction id from which this node should start applying after boot.
-get_init_tnx_id() ->
-    application:get_env(emqx, cluster_rpc_init_tnx_id, -1).
-
 maybe_load_config() ->
    case get_config_loader() of
        emqx ->
--- a/apps/emqx_conf/src/emqx_cluster_rpc.erl
+++ b/apps/emqx_conf/src/emqx_cluster_rpc.erl
@ -29,7 +29,9 @@
    status/0,
    skip_failed_commit/1,
    fast_forward_to_commit/2,
-    on_mria_stop/1
+    on_mria_stop/1,
+    wait_for_cluster_rpc/0,
+    maybe_init_tnx_id/2
 ]).
 -export([
    commit/2,
@ -59,16 +61,17 @@

 -export_type([tnx_id/0, succeed_num/0]).

-ifdef(TEST).
-compile(export_all).
-compile(nowarn_export_all).
-endif.
-
 -boot_mnesia({mnesia, [boot]}).

 -include_lib("emqx/include/logger.hrl").
 -include("emqx_conf.hrl").

+-ifdef(TEST).
+-compile(export_all).
+-compile(nowarn_export_all).
+
+-endif.
+
 -define(INITIATE(MFA), {initiate, MFA}).
 -define(CATCH_UP, catch_up).
 -define(TIMEOUT, timer:minutes(1)).
@ -276,6 +279,20 @@ on_mria_stop(leave) ->
 on_mria_stop(_) ->
    ok.

+wait_for_cluster_rpc() ->
+    %% Workaround for https://github.com/emqx/mria/issues/94:
+    Msg1 = #{msg => "wait_for_cluster_rpc_shard"},
+    case mria_rlog:wait_for_shards([?CLUSTER_RPC_SHARD], 1500) of
+        ok -> ?SLOG(info, Msg1#{result => ok});
+        Error0 -> ?SLOG(error, Msg1#{result => Error0})
+    end,
+    Msg2 = #{msg => "wait_for_cluster_rpc_tables"},
+    case mria:wait_for_tables([?CLUSTER_MFA, ?CLUSTER_COMMIT]) of
+        ok -> ?SLOG(info, Msg2#{result => ok});
+        Error1 -> ?SLOG(error, Msg2#{result => Error1})
+    end,
+    ok.
+
 %%%===================================================================
 %%% gen_server callbacks
 %%%===================================================================
@ -285,20 +302,19 @@ init([Node, RetryMs]) ->
    register_mria_stop_cb(fun ?MODULE:on_mria_stop/1),
    {ok, _} = mnesia:subscribe({table, ?CLUSTER_MFA, simple}),
    State = #{node => Node, retry_interval => RetryMs, is_leaving => false},
-    %% The init transaction ID is set in emqx_conf_app after
-    %% it has fetched the latest config from one of the core nodes
-    TnxId = emqx_app:get_init_tnx_id(),
-    ok = maybe_init_tnx_id(Node, TnxId),
    %% Now continue with the normal catch-up process
    %% That is: apply the missing transactions after the config
    %% was copied until now.
-    {ok, State, {continue, ?CATCH_UP}}.
+    {ok, State, {continue, {?CATCH_UP, init}}}.

 %% @private
-handle_continue(?CATCH_UP, State) ->
+handle_continue({?CATCH_UP, init}, State) ->
    %% emqx app must be started before
    %% trying to catch up the rpc commit logs
    ok = wait_for_emqx_ready(),
+    ok = wait_for_cluster_rpc(),
+    {noreply, State, catch_up(State)};
+handle_continue(?CATCH_UP, State) ->
    {noreply, State, catch_up(State)}.

 handle_call(reset, _From, State) ->
@ -388,7 +404,8 @@ read_next_mfa(Node) ->
                }),
                TnxId;
            [#cluster_rpc_commit{tnx_id = LastAppliedID}] ->
-                LastAppliedID + 1
+                OldestId = get_oldest_mfa_id(),
+                max(LastAppliedID + 1, OldestId)
        end,
    case mnesia:read(?CLUSTER_MFA, NextId) of
        [] -> caught_up;
@ -404,8 +421,7 @@ do_fast_forward_to_commit(ToTnxId, State = #{node := Node}) ->
        true ->
            NodeId;
        false ->
-            {atomic, LatestId} = transaction(fun ?MODULE:get_cluster_tnx_id/0, []),
-            case LatestId =< NodeId of
+            case latest_tnx_id() =< NodeId of
                true ->
                    NodeId;
                false ->
@ -420,6 +436,12 @@ get_cluster_tnx_id() ->
        Id -> Id
    end.

+get_oldest_mfa_id() ->
+    case mnesia:first(?CLUSTER_MFA) of
+        '$end_of_table' -> 0;
+        Id -> Id
+    end.
+
 %% The entry point of a config change transaction.
 init_mfa(Node, MFA) ->
    mnesia:write_lock_table(?CLUSTER_MFA),
--- a/apps/emqx_conf/src/emqx_conf.app.src
+++ b/apps/emqx_conf/src/emqx_conf.app.src
@ -1,6 +1,6 @@
 {application, emqx_conf, [
    {description, "EMQX configuration management"},
-    {vsn, "0.1.23"},
+    {vsn, "0.1.24"},
    {registered, []},
    {mod, {emqx_conf_app, []}},
    {applications, [kernel, stdlib, emqx_ctl]},
--- a/apps/emqx_conf/src/emqx_conf_app.erl
+++ b/apps/emqx_conf/src/emqx_conf_app.erl
@ -93,10 +93,12 @@ sync_data_from_node() ->
 %% Internal functions
 %% ------------------------------------------------------------------------------

-init_load() ->
+init_load(TnxId) ->
    case emqx_app:get_config_loader() of
        Module when Module == emqx; Module == emqx_conf ->
            ok = emqx_config:init_load(emqx_conf:schema_module()),
+            %% Set load config done after update(init) tnx_id.
+            ok = emqx_cluster_rpc:maybe_init_tnx_id(node(), TnxId),
            ok = emqx_app:set_config_loader(emqx_conf),
            ok;
        Module ->
@ -112,12 +114,10 @@ init_load_done() ->
    emqx_app:get_config_loader() =/= emqx.

 init_conf() ->
-    %% Workaround for https://github.com/emqx/mria/issues/94:
-    _ = mria_rlog:wait_for_shards([?CLUSTER_RPC_SHARD], 1000),
-    _ = mria:wait_for_tables([?CLUSTER_MFA, ?CLUSTER_COMMIT]),
+    emqx_cluster_rpc:wait_for_cluster_rpc(),
    {ok, TnxId} = sync_cluster_conf(),
-    _ = emqx_app:set_init_tnx_id(TnxId),
-    ok = init_load().
+    ok = init_load(TnxId),
+    ok.

 cluster_nodes() ->
    mria:cluster_nodes(cores) -- [node()].
--- a/apps/emqx_conf/src/emqx_conf_schema.erl
+++ b/apps/emqx_conf/src/emqx_conf_schema.erl
@ -684,10 +684,10 @@ fields("cluster_call") ->
            )},
        {"max_history",
            sc(
-                range(1, 500),
+                range(100, 10240),
                #{
                    desc => ?DESC(cluster_call_max_history),
-                    default => 100
+                    default => 1024
                }
            )},
        {"cleanup_interval",
@ -695,7 +695,7 @@ fields("cluster_call") ->
                emqx_schema:duration(),
                #{
                    desc => ?DESC(cluster_call_cleanup_interval),
-                    default => <<"5m">>
+                    default => <<"24h">>
                }
            )}
    ];
--- a/changes/ce/fix-11214.en.md
+++ b/changes/ce/fix-11214.en.md
@ -0,0 +1 @@
+Fix a bug where node configuration may fail to synchronize correctly when joining the cluster.