fix(live_conn): fix double-decrement for anonymous clients

`set_chan_info` does contain the `conn_state`, but it seems that it is
not called for anonymous clients disconnecting.

`emqx_channel:handle_info({sock_closed, normal}, _, _)` is called with
`conn_state := connected` (which decrements by calling
`emqx_channel:ensure_disconnected` and does not call `set_chan_info`),
and afterwards `emqx_cm` receives a `DOWN` message from the same chan.

When the `DOWN` message is received, we check `?CHAN_INFO_TAB` for
that key and check if the persisted state is `connected`. Only in that
case we decrement during `DOWN` messages.

Testing manually, it is still being persisted as `connected`, hence
the double-decrease.

We attempt to solve this by using the ETS as a set with `{ClientID,
ChanPid}` as the key to resolve duplication.
This commit is contained in:
Thales Macedo Garitezi 2021-11-08 11:25:55 -03:00
parent 8a4e0a3ecb
commit 0955122468
No known key found for this signature in database
GPG Key ID: DD279F8152A9B6DD
6 changed files with 114 additions and 46 deletions

View File

@ -1536,7 +1536,9 @@ ensure_connected(Channel = #channel{conninfo = ConnInfo,
clientinfo = ClientInfo}) ->
NConnInfo = ConnInfo#{connected_at => erlang:system_time(millisecond)},
ok = run_hooks('client.connected', [ClientInfo, NConnInfo]),
emqx_cm:increment_connected_client_count(),
ClientID = info(clientid, Channel),
Chan = {ClientID, self()},
emqx_cm:increment_connected_client_count(Chan),
Channel#channel{conninfo = NConnInfo,
conn_state = connected
}.
@ -1625,7 +1627,9 @@ ensure_disconnected(Reason, Channel = #channel{conninfo = ConnInfo,
clientinfo = ClientInfo}) ->
NConnInfo = ConnInfo#{disconnected_at => erlang:system_time(millisecond)},
ok = run_hooks('client.disconnected', [ClientInfo, Reason, NConnInfo]),
emqx_cm:decrement_connected_client_count(),
ClientID = info(clientid, Channel),
Chan = {ClientID, self()},
emqx_cm:decrement_connected_client_count(Chan),
Channel#channel{conninfo = NConnInfo, conn_state = disconnected}.
%%--------------------------------------------------------------------

View File

@ -75,8 +75,8 @@
%% Internal export
-export([ stats_fun/0
, clean_down/1
, increment_connected_client_count/0
, decrement_connected_client_count/0
, increment_connected_client_count/1
, decrement_connected_client_count/1
, get_connected_client_count/0
]).
@ -86,18 +86,15 @@
-define(CHAN_TAB, emqx_channel).
-define(CHAN_CONN_TAB, emqx_channel_conn).
-define(CHAN_INFO_TAB, emqx_channel_info).
-define(CONN_CLIENT_TAB, connected_client_counter).
-define(CHAN_STATS,
[{?CHAN_TAB, 'channels.count', 'channels.max'},
{?CHAN_TAB, 'sessions.count', 'sessions.max'},
{?CHAN_CONN_TAB, 'connections.count', 'connections.max'}
{?CHAN_CONN_TAB, 'connections.count', 'connections.max'},
{?CONN_CLIENT_TAB, 'live_connections.count', 'live_connections.max'}
]).
-define(CONN_CLIENT_TAB, connected_client_counter).
-define(CONN_CLIENT_TAB_KEY, connected_client_count).
-define(CONN_CLIENT_TAB_IDX, 2).
-define(CONNECTED_CLIENT_STATS, {'live_connections.count', 'live_connections.max'}).
%% Batch drain
-define(BATCH_SIZE, 100000).
@ -448,8 +445,7 @@ init([]) ->
ok = emqx_tables:new(?CHAN_TAB, [bag, {read_concurrency, true}|TabOpts]),
ok = emqx_tables:new(?CHAN_CONN_TAB, [bag | TabOpts]),
ok = emqx_tables:new(?CHAN_INFO_TAB, [set, compressed | TabOpts]),
ok = emqx_tables:new(?CONN_CLIENT_TAB, [set | TabOpts]),
true = ets:insert(?CONN_CLIENT_TAB, {?CONN_CLIENT_TAB_KEY, 0}),
ok = emqx_tables:new(?CONN_CLIENT_TAB, [set, {write_concurrency, true} | TabOpts]),
ok = emqx_stats:update_interval(chan_stats, fun ?MODULE:stats_fun/0),
State = #{chan_pmon => emqx_pmon:new()},
{ok, State}.
@ -466,6 +462,7 @@ handle_cast(Msg, State) ->
{noreply, State}.
handle_info({'DOWN', _MRef, process, Pid, _Reason}, State = #{chan_pmon := PMon}) ->
?tp(emqx_cm_process_down, #{pid => Pid, reason => _Reason}),
ChanPids = [Pid | emqx_misc:drain_down(?BATCH_SIZE)],
{Items, PMon1} = emqx_pmon:erase_all(ChanPids, PMon),
lists:foreach(
@ -473,7 +470,7 @@ handle_info({'DOWN', _MRef, process, Pid, _Reason}, State = #{chan_pmon := PMon}
Chan = {ClientID, ChanPid},
case ets:lookup(?CHAN_INFO_TAB, Chan) of
[{Chan, #{conn_state := connected}, _}] ->
decrement_connected_client_count();
decrement_connected_client_count(Chan);
_ ->
ok
end
@ -499,8 +496,7 @@ clean_down({ChanPid, ClientId}) ->
do_unregister_channel({ClientId, ChanPid}).
stats_fun() ->
lists:foreach(fun update_stats/1, ?CHAN_STATS),
update_connected_client_stats().
lists:foreach(fun update_stats/1, ?CHAN_STATS).
update_stats({Tab, Stat, MaxStat}) ->
case ets:info(Tab, size) of
@ -508,12 +504,6 @@ update_stats({Tab, Stat, MaxStat}) ->
Size -> emqx_stats:setstat(Stat, MaxStat, Size)
end.
update_connected_client_stats() ->
{CountStat, MaxStat} = ?CONNECTED_CLIENT_STATS,
CCCount = get_connected_client_count(),
emqx_stats:setstat(CountStat, MaxStat, CCCount),
ok.
get_chann_conn_mod(ClientId, ChanPid) when node(ChanPid) == node() ->
Chan = {ClientId, ChanPid},
try [ConnMod] = ets:lookup_element(?CHAN_CONN_TAB, Chan, 2), ConnMod
@ -523,19 +513,18 @@ get_chann_conn_mod(ClientId, ChanPid) when node(ChanPid) == node() ->
get_chann_conn_mod(ClientId, ChanPid) ->
rpc_call(node(ChanPid), get_chann_conn_mod, [ClientId, ChanPid], ?T_GET_INFO).
increment_connected_client_count() ->
increment_connected_client_count(Chan) ->
?tp(emqx_cm_connected_client_count_inc, #{}),
ets:update_counter(?CONN_CLIENT_TAB, ?CONN_CLIENT_TAB_KEY,
{?CONN_CLIENT_TAB_IDX, 1}),
ets:insert_new(?CONN_CLIENT_TAB, {Chan, true}),
ok.
decrement_connected_client_count() ->
decrement_connected_client_count(Chan) ->
?tp(emqx_cm_connected_client_count_dec, #{}),
Threshold = 0,
SetValue = 0,
ets:update_counter(?CONN_CLIENT_TAB, ?CONN_CLIENT_TAB_KEY,
{?CONN_CLIENT_TAB_IDX, -1, Threshold, SetValue}),
ets:delete(?CONN_CLIENT_TAB, Chan),
ok.
get_connected_client_count() ->
ets:lookup_element(?CONN_CLIENT_TAB, ?CONN_CLIENT_TAB_KEY, ?CONN_CLIENT_TAB_IDX).
case ets:info(?CONN_CLIENT_TAB, size) of
undefined -> 0;
Size -> Size
end.

View File

@ -36,6 +36,7 @@ all() ->
groups() ->
TCs = emqx_ct:all(?MODULE),
ConnClientTCs = [ t_connected_client_count_persistent
, t_connected_client_count_anonymous
, t_connected_client_stats
],
OtherTCs = TCs -- ConnClientTCs,
@ -319,8 +320,8 @@ t_stats_fun({'end', _Config}) ->
ok = emqx_broker:unsubscribe(<<"topic">>),
ok = emqx_broker:unsubscribe(<<"topic2">>).
%% persistent sessions, when gone, do not contribute to connect client
%% count
%% persistent sessions, when gone, do not contribute to connected
%% client count
t_connected_client_count_persistent({init, Config}) ->
ok = snabbkaffe:start_trace(),
process_flag(trap_exit, true),
@ -366,17 +367,90 @@ t_connected_client_count_persistent(Config) when is_list(Config) ->
),
?assertEqual(1, emqx_cm:get_connected_client_count()),
%% abnormal exit of channel process
[ChanPid] = emqx_cm:all_channels(),
{true, {ok, [_]}} = wait_for_events(
fun() -> exit(ChanPid, kill) end,
[emqx_cm_connected_client_count_dec]
),
ChanPids = emqx_cm:all_channels(),
{ok, {ok, [_, _]}} = wait_for_events(
fun() ->
lists:foreach(
fun(ChanPid) -> exit(ChanPid, kill) end,
ChanPids)
end,
[ emqx_cm_connected_client_count_dec
, emqx_cm_process_down
]
),
?assertEqual(0, emqx_cm:get_connected_client_count()),
ok;
t_connected_client_count_persistent({'end', _Config}) ->
snabbkaffe:stop(),
ok.
%% connections without client_id also contribute to connected client
%% count
t_connected_client_count_anonymous({init, Config}) ->
ok = snabbkaffe:start_trace(),
process_flag(trap_exit, true),
Config;
t_connected_client_count_anonymous(Config) when is_list(Config) ->
ConnFun = ?config(conn_fun, Config),
?assertEqual(0, emqx_cm:get_connected_client_count()),
%% first client
{ok, ConnPid0} = emqtt:start_link([ {clean_start, true}
| Config]),
{{ok, _}, {ok, [_]}} = wait_for_events(
fun() -> emqtt:ConnFun(ConnPid0) end,
[emqx_cm_connected_client_count_inc]
),
?assertEqual(1, emqx_cm:get_connected_client_count()),
%% second client
{ok, ConnPid1} = emqtt:start_link([ {clean_start, true}
| Config]),
{{ok, _}, {ok, [_]}} = wait_for_events(
fun() -> emqtt:ConnFun(ConnPid1) end,
[emqx_cm_connected_client_count_inc]
),
?assertEqual(2, emqx_cm:get_connected_client_count()),
%% when first client disconnects, shouldn't affect the second
{ok, {ok, [_, _]}} = wait_for_events(
fun() -> emqtt:disconnect(ConnPid0) end,
[ emqx_cm_connected_client_count_dec
, emqx_cm_process_down
]
),
?assertEqual(1, emqx_cm:get_connected_client_count()),
%% reconnecting
{ok, ConnPid2} = emqtt:start_link([ {clean_start, true}
| Config
]),
{{ok, _}, {ok, [_]}} = wait_for_events(
fun() -> emqtt:ConnFun(ConnPid2) end,
[emqx_cm_connected_client_count_inc]
),
?assertEqual(2, emqx_cm:get_connected_client_count()),
{ok, {ok, [_, _]}} = wait_for_events(
fun() -> emqtt:disconnect(ConnPid1) end,
[ emqx_cm_connected_client_count_dec
, emqx_cm_process_down
]
),
?assertEqual(1, emqx_cm:get_connected_client_count()),
%% abnormal exit of channel process
Chans = emqx_cm:all_channels(),
{ok, {ok, [_, _]}} = wait_for_events(
fun() ->
lists:foreach(
fun(ChanPid) -> exit(ChanPid, kill) end,
Chans)
end,
[ emqx_cm_connected_client_count_dec
, emqx_cm_process_down
]
),
?assertEqual(0, emqx_cm:get_connected_client_count()),
ok;
t_connected_client_count_anonymous({'end', _Config}) ->
snabbkaffe:stop(),
ok.
t_connected_client_stats({init, Config}) ->
ok = supervisor:terminate_child(emqx_kernel_sup, emqx_stats),
{ok, _} = supervisor:restart_child(emqx_kernel_sup, emqx_stats),
@ -422,16 +496,17 @@ t_connected_client_stats({'end', _Config}) ->
{ok, _} = supervisor:restart_child(emqx_kernel_sup, emqx_stats),
ok.
%% alwayseven if
%% the count must be always non negative
t_connect_client_never_negative({init, Config}) ->
Config;
t_connect_client_never_negative(Config) when is_list(Config) ->
?assertEqual(0, emqx_cm:get_connected_client_count()),
%% would go to -1
emqx_cm:decrement_connected_client_count(),
Chan = {<<"clientid">>, list_to_pid("<0.0.1>")},
emqx_cm:decrement_connected_client_count(Chan),
?assertEqual(0, emqx_cm:get_connected_client_count()),
%% would be 0, if really went to -1
emqx_cm:increment_connected_client_count(),
emqx_cm:increment_connected_client_count(Chan),
?assertEqual(1, emqx_cm:get_connected_client_count()),
ok;
t_connect_client_never_negative({'end', _Config}) ->

View File

@ -33,8 +33,8 @@ all() -> emqx_ct:all(?MODULE).
init_per_suite(Config) ->
%% CM Meck
ok = meck:new(emqx_cm, [passthrough, no_history, no_link]),
ok = meck:expect(emqx_cm, increment_connected_client_count, fun() -> ok end),
ok = meck:expect(emqx_cm, decrement_connected_client_count, fun() -> ok end),
ok = meck:expect(emqx_cm, increment_connected_client_count, fun(_) -> ok end),
ok = meck:expect(emqx_cm, decrement_connected_client_count, fun(_) -> ok end),
%% Access Control Meck
ok = meck:new(emqx_access_control, [passthrough, no_history, no_link]),
ok = meck:expect(emqx_access_control, authenticate,

View File

@ -36,8 +36,8 @@ init_per_suite(Config) ->
ok = meck:new(emqx_channel, [passthrough, no_history, no_link]),
%% Meck Cm
ok = meck:new(emqx_cm, [passthrough, no_history, no_link]),
ok = meck:expect(emqx_cm, increment_connected_client_count, fun() -> ok end),
ok = meck:expect(emqx_cm, decrement_connected_client_count, fun() -> ok end),
ok = meck:expect(emqx_cm, increment_connected_client_count, fun(_) -> ok end),
ok = meck:expect(emqx_cm, decrement_connected_client_count, fun(_) -> ok end),
%% Meck Limiter
ok = meck:new(emqx_limiter, [passthrough, no_history, no_link]),
%% Meck Pd

View File

@ -50,8 +50,8 @@ init_per_testcase(TestCase, Config) when
->
%% Meck Cm
ok = meck:new(emqx_cm, [passthrough, no_history, no_link]),
ok = meck:expect(emqx_cm, increment_connected_client_count, fun() -> ok end),
ok = meck:expect(emqx_cm, decrement_connected_client_count, fun() -> ok end),
ok = meck:expect(emqx_cm, increment_connected_client_count, fun(_) -> ok end),
ok = meck:expect(emqx_cm, decrement_connected_client_count, fun(_) -> ok end),
%% Mock cowboy_req
ok = meck:new(cowboy_req, [passthrough, no_history, no_link]),
ok = meck:expect(cowboy_req, header, fun(_, _, _) -> <<>> end),