feat: replace gen_statem by gen_server

2021-08-25 16:38:01 +08:00 · 2021-08-25 16:38:01 +08:00 · 4528508620
parent 60c1c4edba
commit 4528508620
2 changed files with 69 additions and 108 deletions
--- a/apps/emqx_machine/src/emqx_cluster_rpc.erl
+++ b/apps/emqx_machine/src/emqx_cluster_rpc.erl
@ -14,14 +14,14 @@
 %% limitations under the License.
 %%--------------------------------------------------------------------
 -module(emqx_cluster_rpc).
-behaviour(gen_statem).
+-behaviour(gen_server).

 %% API
 -export([start_link/0, mnesia/1]).
 -export([multicall/3, multicall/4, query/1, reset/0, status/0]).

-export([init/1, format_status/2, handle_event/4, terminate/3,
-    code_change/4, callback_mode/0]).
+-export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2,
+    handle_continue/2, code_change/3]).

 -ifdef(TEST).
 -compile(export_all).
@ -40,8 +40,7 @@
 -rlog_shard({?COMMON_SHARD, ?CLUSTER_COMMIT}).

 -define(CATCH_UP, catch_up).
-define(REALTIME, realtime).
-define(CATCH_UP_AFTER(_Ms_), {state_timeout, _Ms_, catch_up_delay}).
+-define(TIMEOUT, timer:minutes(1)).

 %%%===================================================================
 %%% API
@ -49,14 +48,14 @@
 mnesia(boot) ->
    ok = ekka_mnesia:create_table(?CLUSTER_MFA, [
        {type, ordered_set},
+        {rlog_shard, ?COMMON_SHARD},
        {disc_copies, [node()]},
-        {local_content, true},
        {record_name, cluster_rpc_mfa},
        {attributes, record_info(fields, cluster_rpc_mfa)}]),
    ok = ekka_mnesia:create_table(?CLUSTER_COMMIT, [
        {type, set},
+        {rlog_shard, ?COMMON_SHARD},
        {disc_copies, [node()]},
-        {local_content, true},
        {record_name, cluster_rpc_commit},
        {attributes, record_info(fields, cluster_rpc_commit)}]);
 mnesia(copy) ->
@ -66,15 +65,16 @@ mnesia(copy) ->
 start_link() ->
    RetryMs = application:get_env(emqx_machine, cluster_call_retry_interval, 1000),
    start_link(node(), ?MODULE, RetryMs).
+
 start_link(Node, Name, RetryMs) ->
-    gen_statem:start_link({local, Name}, ?MODULE, [Node, RetryMs], []).
+    gen_server:start_link({local, Name}, ?MODULE, [Node, RetryMs], []).

 -spec multicall(Module, Function, Args) -> {ok, TnxId} | {error, Reason} when
    Module :: module(),
    Function :: atom(),
    Args :: [term()],
    TnxId :: pos_integer(),
-    Reason :: term().
+    Reason :: string().
 multicall(M, F, A) ->
    multicall(M, F, A, timer:minutes(2)).

@ -84,14 +84,17 @@ multicall(M, F, A) ->
    Args :: [term()],
    TnxId :: pos_integer(),
    Timeout :: timeout(),
-    Reason :: term().
+    Reason :: string().
 multicall(M, F, A, Timeout) ->
    MFA = {initiate, {M, F, A}},
    case ekka_rlog:role() of
-        core ->  gen_statem:call(?MODULE, MFA, Timeout);
+        core -> gen_server:call(?MODULE, MFA, Timeout);
        replicant ->
+            %% the initiate transaction must happened on core node
+            %% make sure MFA(in the transaction) and the transaction on the same node
+            %% don't need rpc again inside transaction.
            case ekka_rlog_status:upstream_node(?COMMON_SHARD) of
-                {ok, Node} -> gen_statem:call({?MODULE, Node}, MFA, Timeout);
+                {ok, Node} -> gen_server:call({?MODULE, Node}, MFA, Timeout);
                disconnected -> {error, disconnected}
            end
    end.
@ -101,7 +104,7 @@ query(TnxId) ->
    transaction(fun trans_query/1, [TnxId]).

 -spec reset() -> reset.
-reset() -> gen_statem:call(?MODULE, reset).
+reset() -> gen_server:call(?MODULE, reset).

 -spec status() -> {'atomic', [map()]} | {'aborted', Reason :: term()}.
 status() ->
@ -114,75 +117,67 @@ status() ->
 %% @private
 init([Node, RetryMs]) ->
    {ok, _} = mnesia:subscribe({table, ?CLUSTER_MFA, simple}),
-    {ok, ?CATCH_UP, #{node => Node, retry_interval => RetryMs}, ?CATCH_UP_AFTER(0)}.
-
-callback_mode() ->
-    handle_event_function.
+    {ok, #{node => Node, retry_interval => RetryMs}, {continue, ?CATCH_UP}}.

 %% @private
-format_status(Opt, [_PDict, StateName, Data]) ->
-    #{state => StateName, data => Data , opt => Opt}.
+handle_continue(?CATCH_UP, State) ->
+    {noreply, State, catch_up(State)}.

-%% @private
-handle_event(state_timeout, catch_up_delay, _State, Data) ->
-    catch_up(Data);
-
-handle_event(info, {mnesia_table_event, {write, #cluster_rpc_mfa{} = MFARec, _AId}}, ?REALTIME, Data) ->
-    handle_mfa_write_event(MFARec, Data);
-handle_event(info, {mnesia_table_event, {write, #cluster_rpc_mfa{}, _ActivityId}}, ?CATCH_UP, _Data) ->
-    {keep_state_and_data, [postpone, ?CATCH_UP_AFTER(0)]};
-%% we should catch up as soon as possible when we reset all.
-handle_event(info, {mnesia_table_event, {delete,{schema, ?CLUSTER_MFA}, _Tid}}, _, _Data) ->
-    {keep_state_and_data, [?CATCH_UP_AFTER(0)]};
-
-handle_event({call, From}, reset, _State, _Data) ->
+handle_call(reset, _From, State) ->
    _ = ekka_mnesia:clear_table(?CLUSTER_COMMIT),
    _ = ekka_mnesia:clear_table(?CLUSTER_MFA),
-    {keep_state_and_data, [{reply, From, ok}, ?CATCH_UP_AFTER(0)]};
+    {reply, ok, State, {continue, ?CATCH_UP}};

-handle_event({call, From}, {initiate, MFA}, ?REALTIME, Data = #{node := Node}) ->
+handle_call({initiate, MFA}, _From, State = #{node := Node}) ->
    case transaction(fun init_mfa/2, [Node, MFA]) of
        {atomic, {ok, TnxId}} ->
-            {keep_state, Data, [{reply, From, {ok, TnxId}}]};
+            {reply, {ok, TnxId}, State, {continue, ?CATCH_UP}};
        {aborted, Reason} ->
-            {keep_state, Data, [{reply, From, {error, Reason}}]}
-    end;
-handle_event({call, From}, {initiate, _MFA}, ?CATCH_UP, Data = #{retry_interval := RetryMs}) ->
-    case catch_up(Data) of
-        {next_state, ?REALTIME, Data} ->
-            {next_state, ?REALTIME, Data, [{postpone, true}]};
-        _ ->
-            Reason = "There are still transactions that have not been executed.",
-            {keep_state_and_data, [{reply, From, {error, Reason}}, ?CATCH_UP_AFTER(RetryMs)]}
+            {reply, {error, Reason}, State, {continue, ?CATCH_UP}}
    end;
+handle_call(_, _From, State) ->
+    {reply, ok, State, catch_up(State)}.

-handle_event(_EventType, _EventContent, ?CATCH_UP, #{retry_interval := RetryMs}) ->
-    {keep_state_and_data, [?CATCH_UP_AFTER(RetryMs)]};
-handle_event(_EventType, _EventContent, _StateName, _Data) ->
-    keep_state_and_data.
+handle_cast(_, State) ->
+    {noreply, State, catch_up(State)}.

-terminate(_Reason, _StateName, _Data) ->
+handle_info({mnesia_table_event, _}, State) ->
+    {noreply, State, catch_up(State)};
+handle_info(_, State) ->
+    {noreply, State, catch_up(State)}.
+
+terminate(_Reason, _Data) ->
    ok.

-code_change(_OldVsn, StateName, Data, _Extra) ->
-    {ok, StateName, Data}.
+code_change(_OldVsn, State, _Extra) ->
+    {ok, State}.

 %%%===================================================================
 %%% Internal functions
 %%%===================================================================
-catch_up(#{node := Node, retry_interval := RetryMs} = Data) ->
+catch_up(#{node := Node, retry_interval := RetryMs} = State) ->
    case transaction(fun get_next_mfa/1, [Node]) of
-        {atomic, caught_up} -> {next_state, ?REALTIME, Data};
+        {atomic, caught_up} -> ?TIMEOUT;
        {atomic, {still_lagging, NextId, MFA}} ->
            case apply_mfa(NextId, MFA) of
                ok ->
                    case transaction(fun commit/2, [Node, NextId]) of
-                        {atomic, ok} -> catch_up(Data);
-                        _ -> {next_state, ?CATCH_UP, Data, [?CATCH_UP_AFTER(RetryMs)]}
+                        {atomic, ok} -> catch_up(State);
+                        Error ->
+                            ?SLOG(error, #{
+                                msg => "mnesia write transaction failed",
+                                node => Node,
+                                nextId => NextId,
+                                error => Error}),
+                            RetryMs
                    end;
-                _ -> {next_state, ?CATCH_UP, Data, [?CATCH_UP_AFTER(RetryMs)]}
+                _Error -> RetryMs
            end;
-        {aborted, _Reason} -> {next_state, ?CATCH_UP, Data, [?CATCH_UP_AFTER(RetryMs)]}
+        {aborted, Reason} ->
+            ?SLOG(error, #{
+                msg => "get_next_mfa transaction failed",
+                node => Node, error => Reason}),
+            RetryMs
    end.

 get_next_mfa(Node) ->
@ -192,7 +187,9 @@ get_next_mfa(Node) ->
                LatestId = get_latest_id(),
                TnxId = max(LatestId - 1, 0),
                commit(Node, TnxId),
-                ?LOG(notice, "New node(~p) first catch up and start commit at ~p", [Node, TnxId]),
+                ?SLOG(notice, #{
+                    msg => "New node first catch up and start commit.",
+                    node => Node, tnx_id => TnxId}),
                TnxId;
            [#cluster_rpc_commit{tnx_id = LastAppliedID}] -> LastAppliedID + 1
        end,
@ -216,10 +213,15 @@ do_catch_up(ToTnxId, Node) ->
                {error, Reason} -> mnesia:abort(Reason);
                Other -> mnesia:abort(Other)
            end;
-        [#cluster_rpc_commit{tnx_id = LastAppliedId}]  ->
+        [#cluster_rpc_commit{tnx_id = LastAppliedId}] ->
            Reason = lists:flatten(io_lib:format("~p catch up failed by LastAppliedId(~p) > ToTnxId(~p)",
                [Node, LastAppliedId, ToTnxId])),
-            ?LOG(error, Reason),
+            ?SLOG(error, #{
+                msg => "catch up failed!",
+                last_applied_id => LastAppliedId,
+                node => Node,
+                to_tnx_id => ToTnxId
+            }),
            {error, Reason}
    end.

@ -232,35 +234,6 @@ get_latest_id() ->
        Id -> Id
    end.

-handle_mfa_write_event(#cluster_rpc_mfa{tnx_id = EventId, mfa = MFA}, Data) ->
-    #{node := Node, retry_interval := RetryMs} = Data,
-    {atomic, LastAppliedId} = transaction(fun get_last_applied_id/2, [Node, EventId - 1]),
-    if LastAppliedId + 1 =:= EventId ->
-        case apply_mfa(EventId, MFA) of
-            ok ->
-                case transaction(fun commit/2, [Node, EventId]) of
-                    {atomic, ok} ->
-                        {next_state, ?REALTIME, Data};
-                    _ -> {next_state, ?CATCH_UP, Data, [?CATCH_UP_AFTER(RetryMs)]}
-                end;
-            _ -> {next_state, ?CATCH_UP, Data, [?CATCH_UP_AFTER(RetryMs)]}
-        end;
-        LastAppliedId >= EventId -> %% It's means the initiator receive self event or other receive stale event.
-            keep_state_and_data;
-        true ->
-            ?LOG(error, "LastAppliedId+1<EventId, maybe the mnesia event'order is messed up! restart process:~p",
-                [{LastAppliedId, EventId, MFA, Node}]),
-            {stop, {"LastAppliedId+1<EventId", {LastAppliedId, EventId, MFA, Node}}}
-    end.
-
-get_last_applied_id(Node, Default) ->
-    case mnesia:wread({?CLUSTER_COMMIT, Node}) of
-        [#cluster_rpc_commit{tnx_id = TnxId}] -> TnxId;
-        [] ->
-            commit(Node, Default),
-            Default
-    end.
-
 init_mfa(Node, MFA) ->
    mnesia:write_lock_table(?CLUSTER_MFA),
    LatestId = get_latest_id(),
@ -311,12 +284,12 @@ trans_query(TnxId) ->
 apply_mfa(TnxId, {M, F, A} = MFA) ->
    try
        Res = erlang:apply(M, F, A),
-            case Res =:= ok of
-                true ->
-                    ?SLOG(notice, #{msg => "succeeded to apply MFA", tnx_id => TnxId, mfa => MFA, result => ok});
-                false ->
-                    ?SLOG(error, #{msg => "failed to apply MFA", tnx_id => TnxId, mfa => MFA, result => Res})
-                end,
+        case Res =:= ok of
+            true ->
+                ?SLOG(notice, #{msg => "succeeded to apply MFA", tnx_id => TnxId, mfa => MFA, result => ok});
+            false ->
+                ?SLOG(error, #{msg => "failed to apply MFA", tnx_id => TnxId, mfa => MFA, result => Res})
+        end,
        Res
    catch
        C : E ->
--- a/apps/emqx_machine/test/emqx_cluster_rpc_SUITE.erl
+++ b/apps/emqx_machine/test/emqx_cluster_rpc_SUITE.erl
@ -78,8 +78,6 @@ t_base_test(_Config) ->
    ?assertEqual(node(), maps:get(initiator, Query)),
    ?assert(maps:is_key(created_at, Query)),
    ?assertEqual(ok, receive_msg(3, test)),
-    SysStatus = lists:last(lists:last(element(4,sys:get_status(?NODE1)))),
-    ?assertEqual(#{data => #{node => node(),retry_interval => 900}, opt => normal, state => realtime}, SysStatus),
    sleep(400),
    {atomic, Status} = emqx_cluster_rpc:status(),
    ?assertEqual(3, length(Status)),
@ -111,12 +109,9 @@ t_commit_ok_but_apply_fail_on_other_node(_Config) ->
    {atomic, [Status]} = emqx_cluster_rpc:status(),
    ?assertEqual(MFA, maps:get(mfa, Status)),
    ?assertEqual(node(), maps:get(node, Status)),
-    ?assertEqual(realtime, element(1, sys:get_state(?NODE1))),
-    ?assertEqual(catch_up, element(1, sys:get_state(?NODE2))),
-    ?assertEqual(catch_up, element(1, sys:get_state(?NODE3))),
    erlang:send(?NODE2, test),
    Res = gen_statem:call(?NODE2,  {initiate, {M, F, A}}),
-    ?assertEqual({error, "There are still transactions that have not been executed."}, Res),
+    ?assertEqual({error, "MFA return not ok"}, Res),
    ok.

 t_catch_up_status_handle_next_commit(_Config) ->
@ -124,7 +119,6 @@ t_catch_up_status_handle_next_commit(_Config) ->
    {atomic, []} = emqx_cluster_rpc:status(),
    {M, F, A} = {?MODULE, failed_on_node_by_odd, [erlang:whereis(?NODE1)]},
    {ok, _} = emqx_cluster_rpc:multicall(M, F, A),
-    ?assertEqual(catch_up, element(1, sys:get_state(?NODE2))),
    {ok, 2} = gen_statem:call(?NODE2,  {initiate, {M, F, A}}),
    ok.

@ -139,17 +133,11 @@ t_commit_ok_apply_fail_on_other_node_then_recover(_Config) ->
    ?assertEqual([], L),
    ?assertEqual({io, format, ["test"]}, maps:get(mfa, Status)),
    ?assertEqual(node(), maps:get(node, Status)),
-    ?assertEqual(realtime, element(1, sys:get_state(?NODE1))),
-    ?assertEqual(catch_up, element(1, sys:get_state(?NODE2))),
-    ?assertEqual(catch_up, element(1, sys:get_state(?NODE3))),
    sleep(4000),
    {atomic, [Status1]} = emqx_cluster_rpc:status(),
    ?assertEqual(Status, Status1),
    sleep(1600),
    {atomic, NewStatus} = emqx_cluster_rpc:status(),
-    ?assertEqual(realtime, element(1, sys:get_state(?NODE1))),
-    ?assertEqual(realtime, element(1, sys:get_state(?NODE2))),
-    ?assertEqual(realtime, element(1, sys:get_state(?NODE3))),
    ?assertEqual(3, length(NewStatus)),
    Pid = self(),
    MFAEcho = {M1, F1, A1} = {?MODULE, echo, [Pid, test]},