fix(clusterlink): communicate bootstrap requirements via actor handshake

The `session_present` flag is not a reliable way to decide whether bootstrap is needed when durable sessions are enabled.
In that case, the client session may survive a cluster restart while all the external routes are lost, because they are not persistent.
This commit is contained in:
Serge Tupchii 2024-05-28 17:55:36 +03:00
parent d5e82cdfac
commit 21711c6e0d
4 changed files with 46 additions and 31 deletions

View File

@ -170,10 +170,7 @@ actor_init(
case MyClusterName of case MyClusterName of
TargetCluster -> TargetCluster ->
Env = #{timestamp => erlang:system_time(millisecond)}, Env = #{timestamp => erlang:system_time(millisecond)},
{ok, _} = emqx_cluster_link_extrouter:actor_init( emqx_cluster_link_extrouter:actor_init(ClusterName, Actor, Incr, Env);
ClusterName, Actor, Incr, Env
),
ok;
_ -> _ ->
%% The remote cluster uses a different name to refer to this cluster %% The remote cluster uses a different name to refer to this cluster
?SLOG(error, #{ ?SLOG(error, #{

View File

@ -19,7 +19,8 @@
actor_state/3, actor_state/3,
actor_apply_operation/2, actor_apply_operation/2,
actor_apply_operation/3, actor_apply_operation/3,
actor_gc/1 actor_gc/1,
is_present_incarnation/1
]). ]).
%% Internal API %% Internal API
@ -140,7 +141,8 @@ match_to_route(M) ->
cluster :: cluster(), cluster :: cluster(),
actor :: actor(), actor :: actor(),
incarnation :: incarnation(), incarnation :: incarnation(),
lane :: lane() | undefined lane :: lane() | undefined,
extra :: map()
}). }).
-type state() :: #state{}. -type state() :: #state{}.
@ -159,6 +161,12 @@ actor_init(Cluster, Actor, Incarnation, Env = #{timestamp := Now}) ->
actor_init(Cluster, Actor, Incarnation, Env) actor_init(Cluster, Actor, Incarnation, Env)
end. end.
%% Reads the `is_present_incarnation' flag kept in the `extra' map of an actor
%% state. Yields `false' when the state carries no such flag (or when `extra'
%% is not a map at all), so callers always get a boolean-like answer without
%% having to know how the state was built.
-spec is_present_incarnation(state()) -> boolean().
is_present_incarnation(#state{extra = Extra}) when is_map(Extra) ->
    %% Missing key is treated the same as "incarnation was not present".
    maps:get(is_present_incarnation, Extra, false);
is_present_incarnation(_Other) ->
    false.
mnesia_actor_init(Cluster, Actor, Incarnation, TS) -> mnesia_actor_init(Cluster, Actor, Incarnation, TS) ->
%% NOTE %% NOTE
%% We perform this heavy-weight transaction only in the case of a new route %% We perform this heavy-weight transaction only in the case of a new route
@ -173,7 +181,7 @@ mnesia_actor_init(Cluster, Actor, Incarnation, TS) ->
case mnesia:read(?EXTROUTE_ACTOR_TAB, ActorID, write) of case mnesia:read(?EXTROUTE_ACTOR_TAB, ActorID, write) of
[#actor{incarnation = Incarnation, lane = Lane} = Rec] -> [#actor{incarnation = Incarnation, lane = Lane} = Rec] ->
ok = mnesia:write(?EXTROUTE_ACTOR_TAB, Rec#actor{until = bump_actor_ttl(TS)}, write), ok = mnesia:write(?EXTROUTE_ACTOR_TAB, Rec#actor{until = bump_actor_ttl(TS)}, write),
{ok, State#state{lane = Lane}}; {ok, State#state{lane = Lane, extra = #{is_present_incarnation => true}}};
[] -> [] ->
Lane = mnesia_assign_lane(Cluster), Lane = mnesia_assign_lane(Cluster),
Rec = #actor{ Rec = #actor{
@ -183,7 +191,7 @@ mnesia_actor_init(Cluster, Actor, Incarnation, TS) ->
until = bump_actor_ttl(TS) until = bump_actor_ttl(TS)
}, },
ok = mnesia:write(?EXTROUTE_ACTOR_TAB, Rec, write), ok = mnesia:write(?EXTROUTE_ACTOR_TAB, Rec, write),
{ok, State#state{lane = Lane}}; {ok, State#state{lane = Lane, extra = #{is_present_incarnation => false}}};
[#actor{incarnation = Outdated} = Rec] when Incarnation > Outdated -> [#actor{incarnation = Outdated} = Rec] when Incarnation > Outdated ->
{reincarnate, Rec}; {reincarnate, Rec};
[#actor{incarnation = Newer}] -> [#actor{incarnation = Newer}] ->
@ -321,7 +329,7 @@ mnesia_clean_incarnation(#actor{id = Actor, incarnation = Incarnation, lane = La
clean_lane(Lane) -> clean_lane(Lane) ->
ets:foldl( ets:foldl(
fun(#extroute{entry = Entry, mcounter = MCounter}, _) -> fun(#extroute{entry = Entry, mcounter = MCounter}, _) ->
apply_operation(Entry, MCounter, del, Lane) apply_operation(Entry, MCounter, delete, Lane)
end, end,
0, 0,
?EXTROUTE_TAB ?EXTROUTE_TAB

View File

@ -71,6 +71,7 @@
-define(F_TARGET_CLUSTER, 13). -define(F_TARGET_CLUSTER, 13).
-define(F_PROTO_VER, 14). -define(F_PROTO_VER, 14).
-define(F_RESULT, 15). -define(F_RESULT, 15).
-define(F_NEED_BOOTSTRAP, 16).
-define(ROUTE_DELETE, 100). -define(ROUTE_DELETE, 100).
@ -279,18 +280,29 @@ actor_init_ack_resp_msg(Actor, InitRes, ReqId, RespTopic) ->
Payload = #{ Payload = #{
?F_OPERATION => ?OP_ACTOR_INIT_ACK, ?F_OPERATION => ?OP_ACTOR_INIT_ACK,
?F_PROTO_VER => ?PROTO_VER, ?F_PROTO_VER => ?PROTO_VER,
?F_ACTOR => Actor, ?F_ACTOR => Actor
?F_RESULT => InitRes
}, },
Payload1 = with_res_and_bootstrap(Payload, InitRes),
emqx_message:make( emqx_message:make(
undefined, undefined,
?QOS_1, ?QOS_1,
RespTopic, RespTopic,
?ENCODE(Payload), ?ENCODE(Payload1),
#{}, #{},
#{properties => #{'Correlation-Data' => ReqId}} #{properties => #{'Correlation-Data' => ReqId}}
). ).
%% Adds the init result and the bootstrap hint to an actor-init-ack payload.
%%
%% On `{ok, ActorState}' the result field is `ok' and bootstrap is requested
%% exactly when the extrouter reports that the incarnation was not already
%% present. Any other init result is passed through as the result field
%% unchanged, with the bootstrap hint set to `false'.
with_res_and_bootstrap(Payload, {ok, ActorState}) ->
    AlreadyPresent = emqx_cluster_link_extrouter:is_present_incarnation(ActorState),
    Payload#{
        ?F_RESULT => ok,
        ?F_NEED_BOOTSTRAP => not AlreadyPresent
    };
with_res_and_bootstrap(Payload, InitError) ->
    Payload#{
        ?F_RESULT => InitError,
        ?F_NEED_BOOTSTRAP => false
    }.
publish_route_sync(ClientPid, Actor, Incarnation, Updates) -> publish_route_sync(ClientPid, Actor, Incarnation, Updates) ->
PubTopic = ?ROUTE_TOPIC, PubTopic = ?ROUTE_TOPIC,
Payload = #{ Payload = #{
@ -339,9 +351,12 @@ decode_resp1(#{
?F_OPERATION := ?OP_ACTOR_INIT_ACK, ?F_OPERATION := ?OP_ACTOR_INIT_ACK,
?F_ACTOR := Actor, ?F_ACTOR := Actor,
?F_PROTO_VER := ProtoVer, ?F_PROTO_VER := ProtoVer,
?F_RESULT := InitResult ?F_RESULT := InitResult,
?F_NEED_BOOTSTRAP := NeedBootstrap
}) -> }) ->
{actor_init_ack, #{actor => Actor, result => InitResult, proto_ver => ProtoVer}}. {actor_init_ack, #{
actor => Actor, result => InitResult, proto_ver => ProtoVer, need_bootstrap => NeedBootstrap
}}.
decode_forwarded_msg(Payload) -> decode_forwarded_msg(Payload) ->
case ?DECODE(Payload) of case ?DECODE(Payload) of

View File

@ -164,14 +164,6 @@ refine_client_options(Options = #{clientid := ClientID}, Actor) ->
retry_interval => 0 retry_interval => 0
}. }.
%% Converts the `session_present' entry of the client info proplist into a
%% boolean. A missing entry defaults to 0, i.e. `false'. Any value other than
%% 0 or 1 is deliberately left unmatched and crashes with `case_clause'.
client_session_present(ClientPid) ->
    %% FIXME: waiting for emqtt release that fixes session_present type (must be a boolean)
    SessionPresent = proplists:get_value(session_present, emqtt:info(ClientPid), 0),
    case SessionPresent of
        1 -> true;
        0 -> false
    end.
announce_client(Actor, TargetCluster, Pid) -> announce_client(Actor, TargetCluster, Pid) ->
Name = Name =
case Actor of case Actor of
@ -334,13 +326,15 @@ handle_info(
{publish, #{payload := Payload, properties := #{'Correlation-Data' := ReqId}}}, {publish, #{payload := Payload, properties := #{'Correlation-Data' := ReqId}}},
St = #st{actor_init_req_id = ReqId} St = #st{actor_init_req_id = ReqId}
) -> ) ->
{actor_init_ack, #{result := Res} = AckInfoMap} = emqx_cluster_link_mqtt:decode_resp(Payload), {actor_init_ack, #{result := Res, need_bootstrap := NeedBootstrap} = AckInfoMap} = emqx_cluster_link_mqtt:decode_resp(
Payload
),
St1 = St#st{ St1 = St#st{
actor_init_req_id = undefined, actor_init_timer = undefined, remote_actor_info = AckInfoMap actor_init_req_id = undefined, actor_init_timer = undefined, remote_actor_info = AckInfoMap
}, },
case Res of case Res of
ok -> ok ->
{noreply, post_actor_init(St1)}; {noreply, post_actor_init(St1, NeedBootstrap)};
Error -> Error ->
?SLOG(error, #{ ?SLOG(error, #{
msg => "failed_to_init_link", msg => "failed_to_init_link",
@ -410,10 +404,11 @@ init_remote_actor(
St#st{actor_init_req_id = ReqId, actor_init_timer = TRef}. St#st{actor_init_req_id = ReqId, actor_init_timer = TRef}.
post_actor_init( post_actor_init(
St = #st{client = ClientPid, target = TargetCluster, actor = Actor, incarnation = Incr} St = #st{client = ClientPid, target = TargetCluster, actor = Actor, incarnation = Incr},
NeedBootstrap
) -> ) ->
ok = start_syncer(TargetCluster, Actor, Incr), ok = start_syncer(TargetCluster, Actor, Incr),
process_bootstrap(St#st{client = ClientPid}). process_bootstrap(St#st{client = ClientPid}, NeedBootstrap).
handle_connect_error(_Reason, St) -> handle_connect_error(_Reason, St) ->
%% TODO: logs %% TODO: logs
@ -426,14 +421,14 @@ handle_client_down(_Reason, St = #st{target = TargetCluster, actor = Actor}) ->
ok = close_syncer(TargetCluster, Actor), ok = close_syncer(TargetCluster, Actor),
process_connect(St#st{client = undefined}). process_connect(St#st{client = undefined}).
process_bootstrap(St = #st{bootstrapped = false}) -> process_bootstrap(St = #st{bootstrapped = false}, _NeedBootstrap) ->
run_bootstrap(St); run_bootstrap(St);
process_bootstrap(St = #st{client = ClientPid, bootstrapped = true}) -> process_bootstrap(St = #st{bootstrapped = true}, NeedBootstrap) ->
case client_session_present(ClientPid) of case NeedBootstrap of
true -> true ->
process_bootstrapped(St); run_bootstrap(St);
false -> false ->
run_bootstrap(St) process_bootstrapped(St)
end. end.
%% Bootstrapping. %% Bootstrapping.