feat(ds): Add egress metrics

This commit is contained in:
ieQu1 2024-03-21 15:03:33 +01:00
parent c9de336234
commit 606f2a88cd
No known key found for this signature in database
GPG Key ID: 488654DF3FED6FDE
3 changed files with 115 additions and 33 deletions

View File

@ -16,7 +16,14 @@
-module(emqx_ds_builtin_metrics). -module(emqx_ds_builtin_metrics).
%% API: %% API:
-export([child_spec/0, init_for_db/1, init_for_shard/2]). -export([child_spec/0, init_for_db/1, shard_metric_id/2, init_for_shard/1]).
-export([
inc_egress_batches/1,
inc_egress_batches_retry/1,
inc_egress_messages/2,
inc_egress_bytes/2,
observe_egress_flush_time/2
]).
%% behavior callbacks: %% behavior callbacks:
-export([]). -export([]).
@ -24,7 +31,7 @@
%% internal exports: %% internal exports:
-export([]). -export([]).
-export_type([]). -export_type([shard_metrics_id/0]).
%%================================================================================ %%================================================================================
%% Type declarations %% Type declarations
@ -32,18 +39,17 @@
-define(WORKER, ?MODULE). -define(WORKER, ?MODULE).
-define(DB_METRICS, -define(DB_METRICS, []).
[
]). -define(SHARD_METRICS, [
'egress.batches',
'egress.batches.retry',
'egress.messages',
'egress.bytes',
{slide, 'egress.flush_time'}
]).
-define(SHARD_METRICS, -type shard_metrics_id() :: binary().
[
'egress.bytes',
'egress.batches',
'egress.messages',
{slide, 'egress.flush_time'}
]).
%%================================================================================ %%================================================================================
%% API functions %% API functions
@ -57,18 +63,39 @@ child_spec() ->
init_for_db(DB) -> init_for_db(DB) ->
emqx_metrics_worker:create_metrics(?WORKER, DB, ?DB_METRICS, []). emqx_metrics_worker:create_metrics(?WORKER, DB, ?DB_METRICS, []).
-spec init_for_shard(emqx_ds:db(), emqx_ds_replication_layer:shard_id()) -> ok. -spec shard_metric_id(emqx_ds:db(), emqx_ds_replication_layer:shard_id()) -> shard_metrics_id().
init_for_shard(DB, ShardId) -> shard_metric_id(DB, ShardId) ->
Id = iolist_to_binary([atom_to_list(DB), $/, ShardId]), iolist_to_binary([atom_to_list(DB), $/, ShardId]).
emqx_metrics_worker:create_metrics(?WORKER, Id, ?SHARD_METRICS, []).
%%================================================================================ -spec init_for_shard(shard_metrics_id()) -> ok.
%% behavior callbacks init_for_shard(ShardId) ->
%%================================================================================ emqx_metrics_worker:create_metrics(?WORKER, ShardId, ?SHARD_METRICS, []).
%%================================================================================ %% @doc Increase the number of successfully flushed batches
%% Internal exports -spec inc_egress_batches(shard_metrics_id()) -> ok.
%%================================================================================ inc_egress_batches(Id) ->
emqx_metrics_worker:inc(?WORKER, Id, 'egress.batches').
%% @doc Increase the number of times the egress worker had to retry
%% flushing the batch. Bumps the 'egress.batches.retry' counter of the
%% shard identified by `Id'.
-spec inc_egress_batches_retry(shard_metrics_id()) -> ok.
inc_egress_batches_retry(Id) ->
emqx_metrics_worker:inc(?WORKER, Id, 'egress.batches.retry').
%% @doc Increase the number of messages successfully saved to the shard.
%% `NMessages' is passed as the increment for the 'egress.messages'
%% counter of the shard identified by `Id'.
-spec inc_egress_messages(shard_metrics_id(), non_neg_integer()) -> ok.
inc_egress_messages(Id, NMessages) ->
emqx_metrics_worker:inc(?WORKER, Id, 'egress.messages', NMessages).
%% @doc Increase the number of bytes successfully saved to the shard.
%% `NBytes' is passed as the increment for the 'egress.bytes' counter
%% of the shard identified by `Id'.
-spec inc_egress_bytes(shard_metrics_id(), non_neg_integer()) -> ok.
inc_egress_bytes(Id, NBytes) ->
    emqx_metrics_worker:inc(?WORKER, Id, 'egress.bytes', NBytes).
%% @doc Add a sample of time spent flushing the egress to the Raft log (in microseconds).
%% The sample is recorded under 'egress.flush_time', which ?SHARD_METRICS
%% declares as a `slide' metric, for the shard identified by `Id'.
-spec observe_egress_flush_time(shard_metrics_id(), non_neg_integer()) -> ok.
observe_egress_flush_time(Id, FlushTime) ->
emqx_metrics_worker:observe(?WORKER, Id, 'egress.flush_time', FlushTime).
%%================================================================================ %%================================================================================
%% Internal functions %% Internal functions

View File

@ -40,6 +40,7 @@
-export_type([]). -export_type([]).
-include_lib("emqx_utils/include/emqx_message.hrl").
-include_lib("snabbkaffe/include/trace.hrl"). -include_lib("snabbkaffe/include/trace.hrl").
%%================================================================================ %%================================================================================
@ -49,8 +50,16 @@
-define(via(DB, Shard), {via, gproc, {n, l, {?MODULE, DB, Shard}}}). -define(via(DB, Shard), {via, gproc, {n, l, {?MODULE, DB, Shard}}}).
-define(flush, flush). -define(flush, flush).
-record(enqueue_req, {message :: emqx_types:message(), sync :: boolean()}). -record(enqueue_req, {
-record(enqueue_atomic_req, {batch :: [emqx_types:message()], sync :: boolean()}). message :: emqx_types:message(),
sync :: boolean(),
payload_bytes :: non_neg_integer()
}).
-record(enqueue_atomic_req, {
batch :: [emqx_types:message()],
sync :: boolean(),
payload_bytes :: non_neg_integer()
}).
%%================================================================================ %%================================================================================
%% API functions %% API functions
@ -73,7 +82,8 @@ store_batch(DB, Messages, Opts) ->
?via(DB, Shard), ?via(DB, Shard),
#enqueue_req{ #enqueue_req{
message = Message, message = Message,
sync = Sync sync = Sync,
payload_bytes = payload_size(Message)
}, },
infinity infinity
) )
@ -83,11 +93,19 @@ store_batch(DB, Messages, Opts) ->
true -> true ->
maps:foreach( maps:foreach(
fun(Shard, Batch) -> fun(Shard, Batch) ->
PayloadBytes = lists:foldl(
fun(Msg, Acc) ->
Acc + payload_size(Msg)
end,
0,
Batch
),
gen_server:call( gen_server:call(
?via(DB, Shard), ?via(DB, Shard),
#enqueue_atomic_req{ #enqueue_atomic_req{
batch = Batch, batch = Batch,
sync = Sync sync = Sync,
payload_bytes = PayloadBytes
}, },
infinity infinity
) )
@ -108,7 +126,9 @@ store_batch(DB, Messages, Opts) ->
-record(s, { -record(s, {
db :: emqx_ds:db(), db :: emqx_ds:db(),
shard :: emqx_ds_replication_layer:shard_id(), shard :: emqx_ds_replication_layer:shard_id(),
metrics_id :: emqx_ds_builtin_metrics:shard_metrics_id(),
n = 0 :: non_neg_integer(), n = 0 :: non_neg_integer(),
n_bytes = 0 :: non_neg_integer(),
tref :: reference(), tref :: reference(),
batch = [] :: [emqx_types:message()], batch = [] :: [emqx_types:message()],
pending_replies = [] :: [gen_server:from()] pending_replies = [] :: [gen_server:from()]
@ -117,18 +137,21 @@ store_batch(DB, Messages, Opts) ->
init([DB, Shard]) -> init([DB, Shard]) ->
process_flag(trap_exit, true), process_flag(trap_exit, true),
process_flag(message_queue_data, off_heap), process_flag(message_queue_data, off_heap),
MetricsId = emqx_ds_builtin_metrics:shard_metric_id(DB, Shard),
ok = emqx_ds_builtin_metrics:init_for_shard(MetricsId),
S = #s{ S = #s{
db = DB, db = DB,
shard = Shard, shard = Shard,
metrics_id = MetricsId,
tref = start_timer() tref = start_timer()
}, },
{ok, S}. {ok, S}.
%% Both enqueue requests carry the payload size precomputed by the caller
%% (`payload_bytes'); it is forwarded to do_enqueue/5 as a separate argument
%% so the byte counter can be accumulated alongside the message counter.
handle_call(#enqueue_req{message = Msg, sync = Sync, payload_bytes = NBytes}, From, S) ->
    do_enqueue(From, Sync, Msg, NBytes, S);
handle_call(#enqueue_atomic_req{batch = Batch, sync = Sync, payload_bytes = NBytes}, From, S) ->
    Len = length(Batch),
    %% do_enqueue/5 recognises atomic batches only as `{atomic, NumMsgs, Msgs}'.
    %% Embedding the byte count in the tuple would make it fall through to the
    %% single-message clause and enqueue the tuple itself as one message.
    do_enqueue(From, Sync, {atomic, Len, Batch}, NBytes, S);
handle_call(_Call, _From, S) ->
    %% Any other request is rejected without touching the state.
    {reply, {error, unknown_call}, S}.
@ -161,6 +184,11 @@ do_flush(
) -> ) ->
case emqx_ds_replication_layer:ra_store_batch(DB, Shard, lists:reverse(Messages)) of case emqx_ds_replication_layer:ra_store_batch(DB, Shard, lists:reverse(Messages)) of
ok -> ok ->
emqx_ds_builtin_metrics:inc_egress_batches(S#s.metrics_id),
emqx_ds_builtin_metrics:inc_egress_messages(S#s.metrics_id, S#s.n),
emqx_ds_builtin_metrics:inc_egress_bytes(S#s.metrics_id, S#s.n_bytes),
lists:foreach(fun(From) -> gen_server:reply(From, ok) end, Replies),
true = erlang:garbage_collect(),
?tp( ?tp(
emqx_ds_replication_layer_egress_flush, emqx_ds_replication_layer_egress_flush,
#{db => DB, shard => Shard, batch => Messages} #{db => DB, shard => Shard, batch => Messages}
@ -169,6 +197,7 @@ do_flush(
true = erlang:garbage_collect(), true = erlang:garbage_collect(),
ok; ok;
Error -> Error ->
emqx_ds_builtin_metrics:inc_egress_batches_retry(S#s.metrics_id),
true = erlang:garbage_collect(), true = erlang:garbage_collect(),
?tp( ?tp(
warning, warning,
@ -184,19 +213,27 @@ do_flush(
end, end,
S#s{ S#s{
n = 0, n = 0,
n_bytes = 0,
batch = [], batch = [],
pending_replies = [], pending_replies = [],
tref = start_timer() tref = start_timer()
}. }.
do_enqueue(From, Sync, MsgOrBatch, S0 = #s{n = N, batch = Batch, pending_replies = Replies}) -> do_enqueue(
From,
Sync,
MsgOrBatch,
BatchBytes,
S0 = #s{n = N, n_bytes = NBytes0, batch = Batch, pending_replies = Replies}
) ->
NBytes = NBytes0 + BatchBytes,
NMax = application:get_env(emqx_durable_storage, egress_batch_size, 1000), NMax = application:get_env(emqx_durable_storage, egress_batch_size, 1000),
S1 = S1 =
case MsgOrBatch of case MsgOrBatch of
{atomic, NumMsgs, Msgs} -> {atomic, NumMsgs, Msgs} ->
S0#s{n = N + NumMsgs, batch = Msgs ++ Batch}; S0#s{n = N + NumMsgs, n_bytes = NBytes, batch = Msgs ++ Batch};
Msg -> Msg ->
S0#s{n = N + 1, batch = [Msg | Batch]} S0#s{n = N + 1, n_bytes = NBytes, batch = [Msg | Batch]}
end, end,
%% TODO: later we may want to delay the reply until the message is %% TODO: later we may want to delay the reply until the message is
%% replicated, but it requires changes to the PUBACK/PUBREC flow to %% replicated, but it requires changes to the PUBACK/PUBREC flow to
@ -228,3 +265,8 @@ do_enqueue(From, Sync, MsgOrBatch, S0 = #s{n = N, batch = Batch, pending_replies
start_timer() -> start_timer() ->
Interval = application:get_env(emqx_durable_storage, egress_flush_interval, 100), Interval = application:get_env(emqx_durable_storage, egress_flush_interval, 100),
erlang:send_after(Interval, self(), ?flush). erlang:send_after(Interval, self(), ?flush).
%% @doc Return approximate size of the MQTT message (it doesn't take
%% all things into account, for example headers and extras).
%% The result is the byte size of the payload plus the byte size of the
%% topic; byte_size/1 is used so a non-binary field fails loudly instead
%% of being measured as a tuple.
-spec payload_size(emqx_types:message()) -> non_neg_integer().
payload_size(#message{payload = P, topic = T}) ->
    byte_size(P) + byte_size(T).

View File

@ -212,6 +212,7 @@ collect_mf(?PROMETHEUS_DEFAULT_REGISTRY, Callback) ->
ok = add_collect_family(Callback, cert_metric_meta(), ?MG(cert_data, RawData)), ok = add_collect_family(Callback, cert_metric_meta(), ?MG(cert_data, RawData)),
ok = add_collect_family(Callback, mria_metric_meta(), ?MG(mria_data, RawData)), ok = add_collect_family(Callback, mria_metric_meta(), ?MG(mria_data, RawData)),
ok = add_collect_family(Callback, ds_metric_meta(), ?MG(ds_data, RawData)),
ok = maybe_license_add_collect_family(Callback, RawData), ok = maybe_license_add_collect_family(Callback, RawData),
ok; ok;
collect_mf(_Registry, _Callback) -> collect_mf(_Registry, _Callback) ->
@ -1011,6 +1012,18 @@ catch_all(DataFun) ->
_:_ -> undefined _:_ -> undefined
end. end.
%%========================================
%% Durable storage
%%========================================
%% Metadata for the durable storage egress counters handed to
%% add_collect_family/3. Every entry is a 3-tuple whose second element is
%% `counter' and whose third element is the 'egress.*' metric key.
%% NOTE(review): the first element is presumably the exported Prometheus
%% metric name -- confirm against add_collect_family/3.
ds_metric_meta() ->
    NameToKey = [
        {emqx_ds_egress_batches, 'egress.batches'},
        {emqx_ds_egress_batches_retry, 'egress.batches.retry'},
        {emqx_ds_egress_messages, 'egress.messages'},
        {emqx_ds_egress_bytes, 'egress.bytes'}
    ],
    [{Name, counter, Key} || {Name, Key} <- NameToKey].
%%-------------------------------------------------------------------- %%--------------------------------------------------------------------
%% Collect functions %% Collect functions
%%-------------------------------------------------------------------- %%--------------------------------------------------------------------