refactor(cm): force kill the proc that failed to respond to takeover call
port from: #7026
This commit is contained in:
parent
d35ff9303a
commit
9c1fe4336b
|
@ -262,6 +262,14 @@ open_session(true, ClientInfo = #{clientid := ClientId}, ConnInfo) ->
|
||||||
open_session(false, ClientInfo = #{clientid := ClientId}, ConnInfo) ->
|
open_session(false, ClientInfo = #{clientid := ClientId}, ConnInfo) ->
|
||||||
Self = self(),
|
Self = self(),
|
||||||
ResumeStart = fun(_) ->
|
ResumeStart = fun(_) ->
|
||||||
|
CreateSess =
|
||||||
|
fun() ->
|
||||||
|
Session = create_session(ClientInfo, ConnInfo),
|
||||||
|
Session1 = emqx_persistent_session:persist(
|
||||||
|
ClientInfo,ConnInfo, Session),
|
||||||
|
register_channel(ClientId, Self, ConnInfo),
|
||||||
|
{ok, #{session => Session1, present => false}}
|
||||||
|
end,
|
||||||
case takeover_session(ClientId) of
|
case takeover_session(ClientId) of
|
||||||
{persistent, Session} ->
|
{persistent, Session} ->
|
||||||
%% This is a persistent session without a managing process.
|
%% This is a persistent session without a managing process.
|
||||||
|
@ -274,15 +282,20 @@ open_session(false, ClientInfo = #{clientid := ClientId}, ConnInfo) ->
|
||||||
pendings => Pendings}};
|
pendings => Pendings}};
|
||||||
{living, ConnMod, ChanPid, Session} ->
|
{living, ConnMod, ChanPid, Session} ->
|
||||||
ok = emqx_session:resume(ClientInfo, Session),
|
ok = emqx_session:resume(ClientInfo, Session),
|
||||||
Session1 = emqx_persistent_session:persist( ClientInfo
|
case request_stepdown(
|
||||||
, ConnInfo
|
{takeover, 'end'},
|
||||||
, Session
|
ConnMod,
|
||||||
),
|
ChanPid) of
|
||||||
Pendings = ConnMod:call(ChanPid, {takeover, 'end'}, ?T_TAKEOVER),
|
{ok, Pendings} ->
|
||||||
register_channel(ClientId, Self, ConnInfo),
|
Session1 = emqx_persistent_session:persist(
|
||||||
{ok, #{session => Session1,
|
ClientInfo, ConnInfo, Session),
|
||||||
present => true,
|
register_channel(ClientId, Self, ConnInfo),
|
||||||
pendings => Pendings}};
|
{ok, #{session => Session1,
|
||||||
|
present => true,
|
||||||
|
pendings => Pendings}};
|
||||||
|
{error, _} ->
|
||||||
|
CreateSess()
|
||||||
|
end;
|
||||||
{expired, OldSession} ->
|
{expired, OldSession} ->
|
||||||
_ = emqx_persistent_session:discard(ClientId, OldSession),
|
_ = emqx_persistent_session:discard(ClientId, OldSession),
|
||||||
Session = create_session(ClientInfo, ConnInfo),
|
Session = create_session(ClientInfo, ConnInfo),
|
||||||
|
@ -293,13 +306,7 @@ open_session(false, ClientInfo = #{clientid := ClientId}, ConnInfo) ->
|
||||||
register_channel(ClientId, Self, ConnInfo),
|
register_channel(ClientId, Self, ConnInfo),
|
||||||
{ok, #{session => Session1, present => false}};
|
{ok, #{session => Session1, present => false}};
|
||||||
none ->
|
none ->
|
||||||
Session = create_session(ClientInfo, ConnInfo),
|
CreateSess()
|
||||||
Session1 = emqx_persistent_session:persist( ClientInfo
|
|
||||||
, ConnInfo
|
|
||||||
, Session
|
|
||||||
),
|
|
||||||
register_channel(ClientId, Self, ConnInfo),
|
|
||||||
{ok, #{session => Session1, present => false}}
|
|
||||||
end
|
end
|
||||||
end,
|
end,
|
||||||
emqx_cm_locker:trans(ClientId, ResumeStart).
|
emqx_cm_locker:trans(ClientId, ResumeStart).
|
||||||
|
@ -359,9 +366,9 @@ takeover_session(ClientId) ->
|
||||||
takeover_session(ClientId, Pid) ->
|
takeover_session(ClientId, Pid) ->
|
||||||
try do_takeover_session(ClientId, Pid)
|
try do_takeover_session(ClientId, Pid)
|
||||||
catch
|
catch
|
||||||
_ : noproc -> % emqx_ws_connection: call
|
_ : R when R == noproc;
|
||||||
emqx_persistent_session:lookup(ClientId);
|
R == timeout;
|
||||||
_ : {noproc, _} -> % emqx_connection: gen_server:call
|
R == unexpected_exception -> %% request_stepdown/3
|
||||||
emqx_persistent_session:lookup(ClientId);
|
emqx_persistent_session:lookup(ClientId);
|
||||||
_ : {'EXIT', {noproc, _}} -> % rpc_call/3
|
_ : {'EXIT', {noproc, _}} -> % rpc_call/3
|
||||||
emqx_persistent_session:lookup(ClientId)
|
emqx_persistent_session:lookup(ClientId)
|
||||||
|
@ -372,9 +379,12 @@ do_takeover_session(ClientId, ChanPid) when node(ChanPid) == node() ->
|
||||||
undefined ->
|
undefined ->
|
||||||
emqx_persistent_session:lookup(ClientId);
|
emqx_persistent_session:lookup(ClientId);
|
||||||
ConnMod when is_atom(ConnMod) ->
|
ConnMod when is_atom(ConnMod) ->
|
||||||
%% TODO: if takeover times out, maybe kill the old?
|
case request_stepdown({takeover, 'begin'}, ConnMod, ChanPid) of
|
||||||
Session = ConnMod:call(ChanPid, {takeover, 'begin'}, ?T_TAKEOVER),
|
{ok, Session} ->
|
||||||
{living, ConnMod, ChanPid, Session}
|
{living, ConnMod, ChanPid, Session};
|
||||||
|
{error, Reason} ->
|
||||||
|
error(Reason)
|
||||||
|
end
|
||||||
end;
|
end;
|
||||||
do_takeover_session(ClientId, ChanPid) ->
|
do_takeover_session(ClientId, ChanPid) ->
|
||||||
wrap_rpc(emqx_cm_proto_v1:takeover_session(ClientId, ChanPid)).
|
wrap_rpc(emqx_cm_proto_v1:takeover_session(ClientId, ChanPid)).
|
||||||
|
@ -391,31 +401,52 @@ discard_session(ClientId) when is_binary(ClientId) ->
|
||||||
%% If failed to kick (e.g. timeout) force a kill.
|
%% If failed to kick (e.g. timeout) force a kill.
|
||||||
%% Keeping the stale pid around, or returning error or raise an exception
|
%% Keeping the stale pid around, or returning error or raise an exception
|
||||||
%% benefits nobody.
|
%% benefits nobody.
|
||||||
-spec kick_or_kill(kick | discard, module(), pid()) -> ok.
|
-spec request_stepdown(Action, module(), pid())
|
||||||
kick_or_kill(Action, ConnMod, Pid) ->
|
-> ok
|
||||||
try
|
| {ok, emqx_session:session() | list(emqx_type:deliver())}
|
||||||
|
| {error, term()}
|
||||||
|
when Action :: kick | discard | {takeover, 'begin'} | {takeover, 'end'}.
|
||||||
|
request_stepdown(Action, ConnMod, Pid) ->
|
||||||
|
Timeout =
|
||||||
|
case Action == kick orelse Action == discard of
|
||||||
|
true -> ?T_KICK;
|
||||||
|
_ -> ?T_TAKEOVER
|
||||||
|
end,
|
||||||
|
Return =
|
||||||
%% this is essentially a gen_server:call implemented in emqx_connection
|
%% this is essentially a gen_server:call implemented in emqx_connection
|
||||||
%% and emqx_ws_connection.
|
%% and emqx_ws_connection.
|
||||||
%% the handle_call is implemented in emqx_channel
|
%% the handle_call is implemented in emqx_channel
|
||||||
ok = apply(ConnMod, call, [Pid, Action, ?T_KICK])
|
try apply(ConnMod, call, [Pid, Action, Timeout]) of
|
||||||
catch
|
ok -> ok;
|
||||||
_ : noproc -> % emqx_ws_connection: call
|
Reply -> {ok, Reply}
|
||||||
ok = ?tp(debug, "session_already_gone", #{pid => Pid, action => Action});
|
catch
|
||||||
_ : {noproc, _} -> % emqx_connection: gen_server:call
|
_ : noproc -> % emqx_ws_connection: call
|
||||||
ok = ?tp(debug, "session_already_gone", #{pid => Pid, action => Action});
|
ok = ?tp(debug, "session_already_gone", #{pid => Pid, action => Action}),
|
||||||
_ : {shutdown, _} ->
|
{error, noproc};
|
||||||
ok = ?tp(debug, "session_already_shutdown", #{pid => Pid, action => Action});
|
_ : {noproc, _} -> % emqx_connection: gen_server:call
|
||||||
_ : {{shutdown, _}, _} ->
|
ok = ?tp(debug, "session_already_gone", #{pid => Pid, action => Action}),
|
||||||
ok = ?tp(debug, "session_already_shutdown", #{pid => Pid, action => Action});
|
{error, noproc};
|
||||||
_ : {timeout, {gen_server, call, _}} ->
|
_ : {shutdown, _} ->
|
||||||
?tp(warning, "session_kick_timeout",
|
ok = ?tp(debug, "session_already_shutdown", #{pid => Pid, action => Action}),
|
||||||
#{pid => Pid, action => Action, stale_channel => stale_channel_info(Pid)}),
|
{error, noproc};
|
||||||
ok = force_kill(Pid);
|
_ : {{shutdown, _}, _} ->
|
||||||
_ : Error : St ->
|
ok = ?tp(debug, "session_already_shutdown", #{pid => Pid, action => Action}),
|
||||||
?tp(error, "session_kick_exception",
|
{error, noproc};
|
||||||
#{pid => Pid, action => Action, reason => Error, stacktrace => St,
|
_ : {timeout, {gen_server, call, _}} ->
|
||||||
stale_channel => stale_channel_info(Pid)}),
|
?tp(warning, "session_stepdown_request_timeout",
|
||||||
ok = force_kill(Pid)
|
#{pid => Pid, action => Action, stale_channel => stale_channel_info(Pid)}),
|
||||||
|
ok = force_kill(Pid),
|
||||||
|
{error, timeout};
|
||||||
|
_ : Error : St ->
|
||||||
|
?tp(error, "session_stepdown_request_exception",
|
||||||
|
#{pid => Pid, action => Action, reason => Error, stacktrace => St,
|
||||||
|
stale_channel => stale_channel_info(Pid)}),
|
||||||
|
ok = force_kill(Pid),
|
||||||
|
{error, unexpected_exception}
|
||||||
|
end,
|
||||||
|
case Action == kick orelse Action == discard of
|
||||||
|
true -> ok;
|
||||||
|
_ -> Return
|
||||||
end.
|
end.
|
||||||
|
|
||||||
force_kill(Pid) ->
|
force_kill(Pid) ->
|
||||||
|
@ -438,7 +469,7 @@ do_kick_session(Action, ClientId, ChanPid) ->
|
||||||
%% already deregistered
|
%% already deregistered
|
||||||
ok;
|
ok;
|
||||||
ConnMod when is_atom(ConnMod) ->
|
ConnMod when is_atom(ConnMod) ->
|
||||||
ok = kick_or_kill(Action, ConnMod, ChanPid)
|
ok = request_stepdown(Action, ConnMod, ChanPid)
|
||||||
end.
|
end.
|
||||||
|
|
||||||
%% @private This function is shared for session 'kick' and 'discard' (as the first arg Action).
|
%% @private This function is shared for session 'kick' and 'discard' (as the first arg Action).
|
||||||
|
|
|
@ -190,45 +190,77 @@ t_open_session_race_condition(_) ->
|
||||||
?assertEqual([], emqx_cm:lookup_channels(ClientId)).
|
?assertEqual([], emqx_cm:lookup_channels(ClientId)).
|
||||||
|
|
||||||
t_kick_session_discard_normal(_) ->
|
t_kick_session_discard_normal(_) ->
|
||||||
test_kick_session(discard, normal).
|
test_stepdown_session(discard, normal).
|
||||||
|
|
||||||
t_kick_session_discard_shutdown(_) ->
|
t_kick_session_discard_shutdown(_) ->
|
||||||
test_kick_session(discard, shutdown).
|
test_stepdown_session(discard, shutdown).
|
||||||
|
|
||||||
t_kick_session_discard_shutdown_with_reason(_) ->
|
t_kick_session_discard_shutdown_with_reason(_) ->
|
||||||
test_kick_session(discard, {shutdown, discard}).
|
test_stepdown_session(discard, {shutdown, discard}).
|
||||||
|
|
||||||
t_kick_session_discard_timeout(_) ->
|
t_kick_session_discard_timeout(_) ->
|
||||||
test_kick_session(discard, timeout).
|
test_stepdown_session(discard, timeout).
|
||||||
|
|
||||||
t_kick_session_discard_noproc(_) ->
|
t_kick_session_discard_noproc(_) ->
|
||||||
test_kick_session(discard, noproc).
|
test_stepdown_session(discard, noproc).
|
||||||
|
|
||||||
t_kick_session_kick_normal(_) ->
|
t_kick_session_kick_normal(_) ->
|
||||||
test_kick_session(discard, normal).
|
test_stepdown_session(kick, normal).
|
||||||
|
|
||||||
t_kick_session_kick_shutdown(_) ->
|
t_kick_session_kick_shutdown(_) ->
|
||||||
test_kick_session(discard, shutdown).
|
test_stepdown_session(kick, shutdown).
|
||||||
|
|
||||||
t_kick_session_kick_shutdown_with_reason(_) ->
|
t_kick_session_kick_shutdown_with_reason(_) ->
|
||||||
test_kick_session(discard, {shutdown, discard}).
|
test_stepdown_session(kick, {shutdown, kicked}).
|
||||||
|
|
||||||
t_kick_session_kick_timeout(_) ->
|
t_kick_session_kick_timeout(_) ->
|
||||||
test_kick_session(discard, timeout).
|
test_stepdown_session(kick, timeout).
|
||||||
|
|
||||||
t_kick_session_kick_noproc(_) ->
|
t_kick_session_kick_noproc(_) ->
|
||||||
test_kick_session(discard, noproc).
|
test_stepdown_session(kick, noproc).
|
||||||
|
|
||||||
test_kick_session(Action, Reason) ->
|
t_stepdown_session_takeover_begin_normal(_) ->
|
||||||
|
test_stepdown_session({takeover, 'begin'}, normal).
|
||||||
|
|
||||||
|
t_stepdown_session_takeover_begin_shutdown(_) ->
|
||||||
|
test_stepdown_session({takeover, 'begin'}, shutdown).
|
||||||
|
|
||||||
|
t_stepdown_session_takeover_begin_shutdown_with_reason(_) ->
|
||||||
|
test_stepdown_session({takeover, 'begin'}, {shutdown, kicked}).
|
||||||
|
|
||||||
|
t_stepdown_session_takeover_begin_timeout(_) ->
|
||||||
|
test_stepdown_session({takeover, 'begin'}, timeout).
|
||||||
|
|
||||||
|
t_stepdown_session_takeover_begin_noproc(_) ->
|
||||||
|
test_stepdown_session({takeover, 'begin'}, noproc).
|
||||||
|
|
||||||
|
t_stepdown_session_takeover_end_normal(_) ->
|
||||||
|
test_stepdown_session({takeover, 'end'}, normal).
|
||||||
|
|
||||||
|
t_stepdown_session_takeover_end_shutdown(_) ->
|
||||||
|
test_stepdown_session({takeover, 'end'}, shutdown).
|
||||||
|
|
||||||
|
t_stepdown_session_takeover_end_shutdown_with_reason(_) ->
|
||||||
|
test_stepdown_session({takeover, 'end'}, {shutdown, kicked}).
|
||||||
|
|
||||||
|
t_stepdown_session_takeover_end_timeout(_) ->
|
||||||
|
test_stepdown_session({takeover, 'end'}, timeout).
|
||||||
|
|
||||||
|
t_stepdown_session_takeover_end_noproc(_) ->
|
||||||
|
test_stepdown_session({takeover, 'end'}, noproc).
|
||||||
|
|
||||||
|
test_stepdown_session(Action, Reason) ->
|
||||||
ClientId = rand_client_id(),
|
ClientId = rand_client_id(),
|
||||||
#{conninfo := ConnInfo} = ?ChanInfo,
|
#{conninfo := ConnInfo} = ?ChanInfo,
|
||||||
FakeSessionFun =
|
FakeSessionFun =
|
||||||
fun Loop() ->
|
fun Loop() ->
|
||||||
receive
|
receive
|
||||||
{'$gen_call', From, A} when A =:= kick orelse
|
{'$gen_call', From, A} when A =:= kick orelse
|
||||||
A =:= discard ->
|
A =:= discard orelse
|
||||||
|
A =:= {takeover, 'begin'} orelse
|
||||||
|
A =:= {takeover, 'end'} ->
|
||||||
case Reason of
|
case Reason of
|
||||||
normal ->
|
normal when A =:= kick orelse A =:= discard ->
|
||||||
gen_server:reply(From, ok);
|
gen_server:reply(From, ok);
|
||||||
timeout ->
|
timeout ->
|
||||||
%% no response to the call
|
%% no response to the call
|
||||||
|
@ -253,7 +285,8 @@ test_kick_session(Action, Reason) ->
|
||||||
end,
|
end,
|
||||||
ok = case Action of
|
ok = case Action of
|
||||||
kick -> emqx_cm:kick_session(ClientId);
|
kick -> emqx_cm:kick_session(ClientId);
|
||||||
discard -> emqx_cm:discard_session(ClientId)
|
discard -> emqx_cm:discard_session(ClientId);
|
||||||
|
{takeover, _} -> none = emqx_cm:takeover_session(ClientId), ok
|
||||||
end,
|
end,
|
||||||
case Reason =:= timeout orelse Reason =:= noproc of
|
case Reason =:= timeout orelse Reason =:= noproc of
|
||||||
true ->
|
true ->
|
||||||
|
|
Loading…
Reference in New Issue