Merge pull request #13076 from thalesmg/fix-ds-session-count-client-api-r57-20240520

fix(client mgmt api): cache disconnected durable session count for `/clients` api
This commit is contained in:
Thales Macedo Garitezi 2024-05-21 15:34:21 -03:00 committed by GitHub
commit 31a35f2a15
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 148 additions and 54 deletions

View File

@ -21,7 +21,8 @@
%% API %% API
-export([ -export([
start_link/0, start_link/0,
get_subscription_count/0 get_subscription_count/0,
get_disconnected_session_count/0
]). ]).
%% `gen_server' API %% `gen_server' API
@ -39,7 +40,9 @@
%% call/cast/info events %% call/cast/info events
-record(tally_subs, {}). -record(tally_subs, {}).
-record(tally_disconnected_sessions, {}).
-record(get_subscription_count, {}). -record(get_subscription_count, {}).
-record(get_disconnected_session_count, {}).
%%------------------------------------------------------------------------------ %%------------------------------------------------------------------------------
%% API %% API
@ -59,6 +62,16 @@ get_subscription_count() ->
0 0
end. end.
%% @doc Gets a cached view of the cluster-global count of disconnected persistent sessions.
-spec get_disconnected_session_count() -> non_neg_integer().
get_disconnected_session_count() ->
case emqx_persistent_message:is_persistence_enabled() of
true ->
gen_server:call(?MODULE, #get_disconnected_session_count{}, infinity);
false ->
0
end.
%%------------------------------------------------------------------------------ %%------------------------------------------------------------------------------
%% `gen_server' API %% `gen_server' API
%%------------------------------------------------------------------------------ %%------------------------------------------------------------------------------
@ -66,7 +79,10 @@ get_subscription_count() ->
init(_Opts) -> init(_Opts) ->
case emqx_persistent_message:is_persistence_enabled() of case emqx_persistent_message:is_persistence_enabled() of
true -> true ->
State = #{subs_count => 0}, State = #{
subs_count => 0,
disconnected_session_count => 0
},
{ok, State, {continue, #tally_subs{}}}; {ok, State, {continue, #tally_subs{}}};
false -> false ->
ignore ignore
@ -75,11 +91,18 @@ init(_Opts) ->
handle_continue(#tally_subs{}, State0) -> handle_continue(#tally_subs{}, State0) ->
State = tally_persistent_subscriptions(State0), State = tally_persistent_subscriptions(State0),
ensure_subs_tally_timer(), ensure_subs_tally_timer(),
{noreply, State, {continue, #tally_disconnected_sessions{}}};
handle_continue(#tally_disconnected_sessions{}, State0) ->
State = tally_disconnected_persistent_sessions(State0),
ensure_disconnected_sessions_tally_timer(),
{noreply, State}. {noreply, State}.
handle_call(#get_subscription_count{}, _From, State) -> handle_call(#get_subscription_count{}, _From, State) ->
#{subs_count := N} = State, #{subs_count := N} = State,
{reply, N, State}; {reply, N, State};
handle_call(#get_disconnected_session_count{}, _From, State) ->
#{disconnected_session_count := N} = State,
{reply, N, State};
handle_call(_Call, _From, State) -> handle_call(_Call, _From, State) ->
{reply, {error, bad_call}, State}. {reply, {error, bad_call}, State}.
@ -90,6 +113,10 @@ handle_info(#tally_subs{}, State0) ->
State = tally_persistent_subscriptions(State0), State = tally_persistent_subscriptions(State0),
ensure_subs_tally_timer(), ensure_subs_tally_timer(),
{noreply, State}; {noreply, State};
handle_info(#tally_disconnected_sessions{}, State0) ->
State = tally_disconnected_persistent_sessions(State0),
ensure_disconnected_sessions_tally_timer(),
{noreply, State};
handle_info(_Info, State) -> handle_info(_Info, State) ->
{noreply, State}. {noreply, State}.
@ -101,7 +128,38 @@ tally_persistent_subscriptions(State0) ->
N = emqx_persistent_session_ds_state:total_subscription_count(), N = emqx_persistent_session_ds_state:total_subscription_count(),
State0#{subs_count := N}. State0#{subs_count := N}.
tally_disconnected_persistent_sessions(State0) ->
N = do_tally_disconnected_persistent_sessions(),
State0#{disconnected_session_count := N}.
ensure_subs_tally_timer() -> ensure_subs_tally_timer() ->
Timeout = emqx_config:get([durable_sessions, subscription_count_refresh_interval]), Timeout = emqx_config:get([durable_sessions, subscription_count_refresh_interval]),
_ = erlang:send_after(Timeout, self(), #tally_subs{}), _ = erlang:send_after(Timeout, self(), #tally_subs{}),
ok. ok.
ensure_disconnected_sessions_tally_timer() ->
Timeout = emqx_config:get([durable_sessions, disconnected_session_count_refresh_interval]),
_ = erlang:send_after(Timeout, self(), #tally_disconnected_sessions{}),
ok.
do_tally_disconnected_persistent_sessions() ->
Iter = emqx_persistent_session_ds_state:make_session_iterator(),
do_tally_disconnected_persistent_sessions(Iter, 0).
do_tally_disconnected_persistent_sessions('$end_of_table', N) ->
N;
do_tally_disconnected_persistent_sessions(Iter0, N) ->
case emqx_persistent_session_ds_state:session_iterator_next(Iter0, 1) of
{[], _} ->
N;
{[{Id, _Meta}], Iter} ->
case is_live_session(Id) of
true ->
do_tally_disconnected_persistent_sessions(Iter, N);
false ->
do_tally_disconnected_persistent_sessions(Iter, N + 1)
end
end.
is_live_session(Id) ->
[] =/= emqx_cm_registry:lookup_channels(Id).

View File

@ -1719,6 +1719,14 @@ fields("durable_sessions") ->
importance => ?IMPORTANCE_HIDDEN importance => ?IMPORTANCE_HIDDEN
} }
)}, )},
{"disconnected_session_count_refresh_interval",
sc(
timeout_duration(),
#{
default => <<"5s">>,
importance => ?IMPORTANCE_HIDDEN
}
)},
{"message_retention_period", {"message_retention_period",
sc( sc(
timeout_duration(), timeout_duration(),

View File

@ -476,6 +476,7 @@ zone_global_defaults() ->
renew_streams_interval => 5000, renew_streams_interval => 5000,
session_gc_batch_size => 100, session_gc_batch_size => 100,
session_gc_interval => 600000, session_gc_interval => 600000,
subscription_count_refresh_interval => 5000 subscription_count_refresh_interval => 5000,
disconnected_session_count_refresh_interval => 5000
} }
}. }.

View File

@ -72,6 +72,7 @@
]). ]).
-define(GAUGE_SAMPLER_LIST, [ -define(GAUGE_SAMPLER_LIST, [
disconnected_durable_sessions,
durable_subscriptions, durable_subscriptions,
subscriptions, subscriptions,
topics, topics,

View File

@ -262,6 +262,8 @@ merge_cluster_rate(Node, Cluster) ->
Fun = Fun =
fun fun
%% cluster-synced values %% cluster-synced values
(disconnected_durable_sessions, V, NCluster) ->
NCluster#{disconnected_durable_sessions => V};
(durable_subscriptions, V, NCluster) -> (durable_subscriptions, V, NCluster) ->
NCluster#{durable_subscriptions => V}; NCluster#{durable_subscriptions => V};
(topics, V, NCluster) -> (topics, V, NCluster) ->
@ -417,22 +419,40 @@ getstats(Key) ->
_:_ -> 0 _:_ -> 0
end. end.
stats(connections) -> emqx_stats:getstat('connections.count'); stats(connections) ->
stats(durable_subscriptions) -> emqx_stats:getstat('durable_subscriptions.count'); emqx_stats:getstat('connections.count');
stats(live_connections) -> emqx_stats:getstat('live_connections.count'); stats(disconnected_durable_sessions) ->
stats(cluster_sessions) -> emqx_stats:getstat('cluster_sessions.count'); emqx_persistent_session_bookkeeper:get_disconnected_session_count();
stats(topics) -> emqx_stats:getstat('topics.count'); stats(durable_subscriptions) ->
stats(subscriptions) -> emqx_stats:getstat('subscriptions.count'); emqx_stats:getstat('durable_subscriptions.count');
stats(shared_subscriptions) -> emqx_stats:getstat('subscriptions.shared.count'); stats(live_connections) ->
stats(retained_msg_count) -> emqx_stats:getstat('retained.count'); emqx_stats:getstat('live_connections.count');
stats(received) -> emqx_metrics:val('messages.received'); stats(cluster_sessions) ->
stats(received_bytes) -> emqx_metrics:val('bytes.received'); emqx_stats:getstat('cluster_sessions.count');
stats(sent) -> emqx_metrics:val('messages.sent'); stats(topics) ->
stats(sent_bytes) -> emqx_metrics:val('bytes.sent'); emqx_stats:getstat('topics.count');
stats(validation_succeeded) -> emqx_metrics:val('messages.validation_succeeded'); stats(subscriptions) ->
stats(validation_failed) -> emqx_metrics:val('messages.validation_failed'); emqx_stats:getstat('subscriptions.count');
stats(dropped) -> emqx_metrics:val('messages.dropped'); stats(shared_subscriptions) ->
stats(persisted) -> emqx_metrics:val('messages.persisted'). emqx_stats:getstat('subscriptions.shared.count');
stats(retained_msg_count) ->
emqx_stats:getstat('retained.count');
stats(received) ->
emqx_metrics:val('messages.received');
stats(received_bytes) ->
emqx_metrics:val('bytes.received');
stats(sent) ->
emqx_metrics:val('messages.sent');
stats(sent_bytes) ->
emqx_metrics:val('bytes.sent');
stats(validation_succeeded) ->
emqx_metrics:val('messages.validation_succeeded');
stats(validation_failed) ->
emqx_metrics:val('messages.validation_failed');
stats(dropped) ->
emqx_metrics:val('messages.dropped');
stats(persisted) ->
emqx_metrics:val('messages.persisted').
%% ------------------------------------------------------------------------------------------------- %% -------------------------------------------------------------------------------------------------
%% Retained && License Quota %% Retained && License Quota

View File

@ -194,6 +194,8 @@ swagger_desc(validation_failed) ->
swagger_desc_format("Schema validations failed "); swagger_desc_format("Schema validations failed ");
swagger_desc(persisted) -> swagger_desc(persisted) ->
swagger_desc_format("Messages saved to the durable storage "); swagger_desc_format("Messages saved to the durable storage ");
swagger_desc(disconnected_durable_sessions) ->
<<"Disconnected durable sessions at the time of sampling.", ?APPROXIMATE_DESC>>;
swagger_desc(durable_subscriptions) -> swagger_desc(durable_subscriptions) ->
<<"Subscriptions from durable sessions at the time of sampling.", ?APPROXIMATE_DESC>>; <<"Subscriptions from durable sessions at the time of sampling.", ?APPROXIMATE_DESC>>;
swagger_desc(subscriptions) -> swagger_desc(subscriptions) ->

View File

@ -341,6 +341,8 @@ t_persistent_session_stats(_Config) ->
?retry(1_000, 10, begin ?retry(1_000, 10, begin
?assertMatch( ?assertMatch(
{ok, #{ {ok, #{
<<"connections">> := 2,
<<"disconnected_durable_sessions">> := 0,
%% N.B.: we currently don't perform any deduplication between persistent %% N.B.: we currently don't perform any deduplication between persistent
%% and non-persistent routes, so we count `commont/topic' twice and get 8 %% and non-persistent routes, so we count `commont/topic' twice and get 8
%% instead of 6 here. %% instead of 6 here.
@ -356,6 +358,29 @@ t_persistent_session_stats(_Config) ->
?assert(PSRouteCount > 0, #{ps_route_count => PSRouteCount}), ?assert(PSRouteCount > 0, #{ps_route_count => PSRouteCount}),
PSSubCount = emqx_persistent_session_bookkeeper:get_subscription_count(), PSSubCount = emqx_persistent_session_bookkeeper:get_subscription_count(),
?assert(PSSubCount > 0, #{ps_sub_count => PSSubCount}), ?assert(PSSubCount > 0, #{ps_sub_count => PSSubCount}),
%% Now with disconnected but alive persistent sessions
{ok, {ok, _}} =
?wait_async_action(
emqtt:disconnect(PSClient),
#{?snk_kind := dashboard_monitor_flushed}
),
?retry(1_000, 10, begin
?assertMatch(
{ok, #{
<<"connections">> := 1,
<<"disconnected_durable_sessions">> := 1,
%% N.B.: we currently don't perform any deduplication between persistent
%% and non-persistent routes, so we count `commont/topic' twice and get 8
%% instead of 6 here.
<<"topics">> := 8,
<<"durable_subscriptions">> := 4,
<<"subscriptions">> := 4
}},
request(["monitor_current"])
)
end),
ok. ok.
request(Path) -> request(Path) ->

View File

@ -1384,7 +1384,6 @@ do_list_clients_cluster_query(
{Rows, QueryState1 = #{complete := Complete0}} -> {Rows, QueryState1 = #{complete := Complete0}} ->
case emqx_mgmt_api:accumulate_query_rows(Node, Rows, QueryState1, ResultAcc) of case emqx_mgmt_api:accumulate_query_rows(Node, Rows, QueryState1, ResultAcc) of
{enough, NResultAcc} -> {enough, NResultAcc} ->
%% TODO: add persistent session count?
%% TODO: this may return `{error, _, _}'... %% TODO: this may return `{error, _, _}'...
QueryState2 = emqx_mgmt_api:maybe_collect_total_from_tail_nodes( QueryState2 = emqx_mgmt_api:maybe_collect_total_from_tail_nodes(
Tail, QueryState1 Tail, QueryState1
@ -1428,8 +1427,9 @@ add_persistent_session_count(QueryState0 = #{total := Totals0}) ->
%% to traverse the whole table), but also hard to deduplicate live connections %% to traverse the whole table), but also hard to deduplicate live connections
%% from it... So this count will possibly overshoot the true count of %% from it... So this count will possibly overshoot the true count of
%% sessions. %% sessions.
SessionCount = persistent_session_count(), DisconnectedSessionCount =
Totals = Totals0#{undefined => SessionCount}, emqx_persistent_session_bookkeeper:get_disconnected_session_count(),
Totals = Totals0#{undefined => DisconnectedSessionCount},
QueryState0#{total := Totals}; QueryState0#{total := Totals};
false -> false ->
QueryState0 QueryState0
@ -1477,27 +1477,6 @@ no_persistent_sessions() ->
true true
end. end.
-spec persistent_session_count() -> non_neg_integer().
persistent_session_count() ->
%% N.B.: this is potentially costly. Should not be called in hot paths.
%% `mnesia:table_info(_, size)' is always zero for rocksdb, so we need to traverse...
do_persistent_session_count(init_persistent_session_iterator(), 0).
do_persistent_session_count('$end_of_table', N) ->
N;
do_persistent_session_count(Cursor, N) ->
case emqx_persistent_session_ds_state:session_iterator_next(Cursor, 1) of
{[], _} ->
N;
{[{_Id, Meta}], NextCursor} ->
case is_expired(Meta) of
true ->
do_persistent_session_count(NextCursor, N);
false ->
do_persistent_session_count(NextCursor, N + 1)
end
end.
is_expired(#{last_alive_at := LastAliveAt, expiry_interval := ExpiryInterval}) -> is_expired(#{last_alive_at := LastAliveAt, expiry_interval := ExpiryInterval}) ->
LastAliveAt + ExpiryInterval < erlang:system_time(millisecond). LastAliveAt + ExpiryInterval < erlang:system_time(millisecond).

View File

@ -79,7 +79,9 @@ end_per_suite(Config) ->
init_per_group(persistent_sessions, Config) -> init_per_group(persistent_sessions, Config) ->
AppSpecs = [ AppSpecs = [
{emqx, "durable_sessions.enable = true"}, {emqx,
"durable_sessions.enable = true\n"
"durable_sessions.disconnected_session_count_refresh_interval = 100ms"},
emqx_management emqx_management
], ],
Dashboard = emqx_mgmt_api_test_util:emqx_dashboard( Dashboard = emqx_mgmt_api_test_util:emqx_dashboard(
@ -457,9 +459,7 @@ t_persistent_sessions5(Config) ->
{{_, 200, _}, _, #{ {{_, 200, _}, _, #{
<<"data">> := [_, _, _], <<"data">> := [_, _, _],
<<"meta">> := #{ <<"meta">> := #{
%% TODO: if/when we fix the persistent session count, this <<"count">> := 4,
%% should be 4.
<<"count">> := 6,
<<"hasnext">> := true <<"hasnext">> := true
} }
}}}, }}},
@ -470,9 +470,7 @@ t_persistent_sessions5(Config) ->
{{_, 200, _}, _, #{ {{_, 200, _}, _, #{
<<"data">> := [_], <<"data">> := [_],
<<"meta">> := #{ <<"meta">> := #{
%% TODO: if/when we fix the persistent session count, this <<"count">> := 4,
%% should be 4.
<<"count">> := 6,
<<"hasnext">> := false <<"hasnext">> := false
} }
}}}, }}},
@ -489,9 +487,7 @@ t_persistent_sessions5(Config) ->
{{_, 200, _}, _, #{ {{_, 200, _}, _, #{
<<"data">> := [_, _], <<"data">> := [_, _],
<<"meta">> := #{ <<"meta">> := #{
%% TODO: if/when we fix the persistent session count, this <<"count">> := 4,
%% should be 4.
<<"count">> := 6,
<<"hasnext">> := true <<"hasnext">> := true
} }
}}}, }}},
@ -1996,7 +1992,11 @@ assert_single_client(Opts) ->
100, 100,
20, 20,
?assertMatch( ?assertMatch(
{ok, {{_, 200, _}, _, #{<<"data">> := [#{<<"connected">> := IsConnected}]}}}, {ok,
{{_, 200, _}, _, #{
<<"data">> := [#{<<"connected">> := IsConnected}],
<<"meta">> := #{<<"count">> := 1}
}}},
list_request(APIPort) list_request(APIPort)
) )
), ),