Merge pull request #9619 from thalesmg/refactor-gauges-v50

refactor(metrics): use absolute gauge values rather than deltas (v5.0)
This commit is contained in:
Thales Macedo Garitezi 2023-01-02 10:56:47 -03:00 committed by GitHub
commit 7e02eac3bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 687 additions and 212 deletions

View File

@ -18,6 +18,8 @@
-behaviour(gen_server). -behaviour(gen_server).
-include_lib("stdlib/include/ms_transform.hrl").
%% API functions %% API functions
-export([ -export([
start_link/1, start_link/1,
@ -30,6 +32,11 @@
inc/3, inc/3,
inc/4, inc/4,
get/3, get/3,
get_gauge/3,
set_gauge/5,
shift_gauge/5,
get_gauges/2,
delete_gauges/2,
get_rate/2, get_rate/2,
get_counters/2, get_counters/2,
create_metrics/3, create_metrics/3,
@ -68,14 +75,21 @@
last5m := float() last5m := float()
}. }.
-type metrics() :: #{ -type metrics() :: #{
counters := #{atom() => integer()}, counters := #{metric_name() => integer()},
rate := #{atom() => rate()} gauges := #{metric_name() => integer()},
rate := #{metric_name() => rate()}
}. }.
-type handler_name() :: atom(). -type handler_name() :: atom().
%% metric_id() is actually a resource id
-type metric_id() :: binary() | atom(). -type metric_id() :: binary() | atom().
-type metric_name() :: atom().
-type worker_id() :: term().
-define(CntrRef(Name), {?MODULE, Name}). -define(CntrRef(Name), {?MODULE, Name}).
-define(SAMPCOUNT_5M, (?SECS_5M div ?SAMPLING)). -define(SAMPCOUNT_5M, (?SECS_5M div ?SAMPLING)).
-define(GAUGE_TABLE(NAME),
list_to_atom(atom_to_list(?MODULE) ++ "_" ++ atom_to_list(NAME) ++ "_gauge")
).
-record(rate, { -record(rate, {
max = 0 :: number(), max = 0 :: number(),
@ -112,11 +126,12 @@ child_spec(ChldName, Name) ->
modules => [emqx_metrics_worker] modules => [emqx_metrics_worker]
}. }.
-spec create_metrics(handler_name(), metric_id(), [atom()]) -> ok | {error, term()}. -spec create_metrics(handler_name(), metric_id(), [metric_name()]) -> ok | {error, term()}.
create_metrics(Name, Id, Metrics) -> create_metrics(Name, Id, Metrics) ->
create_metrics(Name, Id, Metrics, Metrics). create_metrics(Name, Id, Metrics, Metrics).
-spec create_metrics(handler_name(), metric_id(), [atom()], [atom()]) -> ok | {error, term()}. -spec create_metrics(handler_name(), metric_id(), [metric_name()], [metric_name()]) ->
ok | {error, term()}.
create_metrics(Name, Id, Metrics, RateMetrics) -> create_metrics(Name, Id, Metrics, RateMetrics) ->
gen_server:call(Name, {create_metrics, Id, Metrics, RateMetrics}). gen_server:call(Name, {create_metrics, Id, Metrics, RateMetrics}).
@ -135,7 +150,7 @@ has_metrics(Name, Id) ->
_ -> true _ -> true
end. end.
-spec get(handler_name(), metric_id(), atom() | integer()) -> number(). -spec get(handler_name(), metric_id(), metric_name() | integer()) -> number().
get(Name, Id, Metric) -> get(Name, Id, Metric) ->
case get_ref(Name, Id) of case get_ref(Name, Id) of
not_found -> not_found ->
@ -167,16 +182,102 @@ reset_counters(Name, Id) ->
-spec get_metrics(handler_name(), metric_id()) -> metrics(). -spec get_metrics(handler_name(), metric_id()) -> metrics().
get_metrics(Name, Id) -> get_metrics(Name, Id) ->
#{rate => get_rate(Name, Id), counters => get_counters(Name, Id)}. #{
rate => get_rate(Name, Id),
counters => get_counters(Name, Id),
gauges => get_gauges(Name, Id)
}.
-spec inc(handler_name(), metric_id(), atom()) -> ok. -spec inc(handler_name(), metric_id(), atom()) -> ok.
inc(Name, Id, Metric) -> inc(Name, Id, Metric) ->
inc(Name, Id, Metric, 1). inc(Name, Id, Metric, 1).
-spec inc(handler_name(), metric_id(), atom(), integer()) -> ok. -spec inc(handler_name(), metric_id(), metric_name(), integer()) -> ok.
inc(Name, Id, Metric, Val) -> inc(Name, Id, Metric, Val) ->
counters:add(get_ref(Name, Id), idx_metric(Name, Id, Metric), Val). counters:add(get_ref(Name, Id), idx_metric(Name, Id, Metric), Val).
-spec set_gauge(handler_name(), metric_id(), worker_id(), metric_name(), integer()) -> ok.
%% @doc Store the absolute gauge value reported by `WorkerId' for
%% `Metric' under resource `Id'.  One row is kept per
%% {Id, Metric, WorkerId} key, so each worker overwrites only its own
%% reading.  If the gauge table for this handler does not exist
%% (handler not started yet, or already stopped), the call is a
%% best-effort no-op.
set_gauge(Name, Id, WorkerId, Metric, Val) ->
    Tab = ?GAUGE_TABLE(Name),
    Key = {Id, Metric, WorkerId},
    try ets:insert(Tab, {Key, Val}) of
        true -> ok
    catch
        %% missing table: swallow and report success (best-effort)
        error:badarg -> ok
    end.
-spec shift_gauge(handler_name(), metric_id(), worker_id(), metric_name(), integer()) -> ok.
%% @doc Add `Val' (possibly negative) to the gauge kept by `WorkerId'
%% for `Metric' under resource `Id'.  The update is atomic; a missing
%% row is seeded with 0 via the default-object argument of
%% ets:update_counter/4.  If the gauge table for this handler does not
%% exist, the call is a best-effort no-op.
shift_gauge(Name, Id, WorkerId, Metric, Val) ->
Table = ?GAUGE_TABLE(Name),
try
%% atomic read-modify-write; the 4th argument creates the row
%% {{Id, Metric, WorkerId}, 0} when the key is absent
_ = ets:update_counter(
Table,
{Id, Metric, WorkerId},
Val,
{{Id, Metric, WorkerId}, 0}
),
ok
catch
%% table (handler) not started or already stopped
error:badarg ->
ok
end.
-spec get_gauge(handler_name(), metric_id(), metric_name()) -> integer().
%% @doc Total value of `Metric' for resource `Id': the sum of the
%% per-worker gauge rows.  Returns 0 when the handler's table, the
%% resource id, or the metric is unknown.
get_gauge(Name, Id, Metric) ->
    Tab = ?GAUGE_TABLE(Name),
    %% Rows are keyed {Id, Metric, WorkerId}.  Bind Id and Metric
    %% directly in the match pattern and project out the stored value;
    %% this selects exactly the rows the previous fun2ms guard did.
    Spec = [{{{Id, Metric, '_'}, '$1'}, [], ['$1']}],
    try ets:select(Tab, Spec) of
        PerWorkerVals -> lists:sum(PerWorkerVals)
    catch
        %% missing table: handler not started or already stopped
        error:badarg -> 0
    end.
-spec get_gauges(handler_name(), metric_id()) -> map().
%% @doc All gauges for resource `Id', as `#{metric_name() => integer()}'
%% where each value is the sum over that metric's per-worker rows.
%% Returns an empty map when the handler's gauge table does not exist.
get_gauges(Name, Id) ->
    Table = ?GAUGE_TABLE(Name),
    MatchSpec =
        ets:fun2ms(
            fun({{Id0, Metric, _WorkerId}, Val}) when Id0 =:= Id ->
                {Metric, Val}
            end
        ),
    try
        %% foldl instead of foldr: addition is commutative, so the
        %% traversal order cannot change the result, and foldl is
        %% tail-recursive (foldr builds a stack frame per element).
        lists:foldl(
            fun({Metric, Val}, Acc) ->
                %% third argument is the initial value used when
                %% Metric is not yet a key of Acc
                maps:update_with(Metric, fun(X) -> X + Val end, Val, Acc)
            end,
            #{},
            ets:select(Table, MatchSpec)
        )
    catch
        %% missing table: handler not started or already stopped
        error:badarg ->
            #{}
    end.
-spec delete_gauges(handler_name(), metric_id()) -> ok.
%% @doc Remove every gauge row (all metrics, all workers) belonging to
%% resource `Id'.  Always returns ok, including when the handler's
%% gauge table does not exist.
delete_gauges(Name, Id) ->
    Tab = ?GAUGE_TABLE(Name),
    try
        %% rows are keyed {Id, Metric, WorkerId}; drop all of them for
        %% this resource id in one pass
        _ = ets:match_delete(Tab, {{Id, '_', '_'}, '_'}),
        ok
    catch
        %% missing table: handler not started or already stopped
        error:badarg ->
            ok
    end.
start_link(Name) -> start_link(Name) ->
gen_server:start_link({local, Name}, ?MODULE, Name, []). gen_server:start_link({local, Name}, ?MODULE, Name, []).
@ -185,6 +286,7 @@ init(Name) ->
%% the rate metrics %% the rate metrics
erlang:send_after(timer:seconds(?SAMPLING), self(), ticking), erlang:send_after(timer:seconds(?SAMPLING), self(), ticking),
persistent_term:put(?CntrRef(Name), #{}), persistent_term:put(?CntrRef(Name), #{}),
_ = ets:new(?GAUGE_TABLE(Name), [named_table, ordered_set, public, {write_concurrency, true}]),
{ok, #state{}}. {ok, #state{}}.
handle_call({get_rate, _Id}, _From, State = #state{rates = undefined}) -> handle_call({get_rate, _Id}, _From, State = #state{rates = undefined}) ->
@ -220,7 +322,10 @@ handle_call(
_From, _From,
State = #state{metric_ids = MIDs, rates = Rates} State = #state{metric_ids = MIDs, rates = Rates}
) -> ) ->
{reply, delete_counters(get_self_name(), Id), State#state{ Name = get_self_name(),
delete_counters(Name, Id),
delete_gauges(Name, Id),
{reply, ok, State#state{
metric_ids = sets:del_element(Id, MIDs), metric_ids = sets:del_element(Id, MIDs),
rates = rates =
case Rates of case Rates of
@ -233,7 +338,9 @@ handle_call(
_From, _From,
State = #state{rates = Rates} State = #state{rates = Rates}
) -> ) ->
{reply, reset_counters(get_self_name(), Id), State#state{ Name = get_self_name(),
delete_gauges(Name, Id),
{reply, reset_counters(Name, Id), State#state{
rates = rates =
case Rates of case Rates of
undefined -> undefined ->

View File

@ -65,7 +65,8 @@
-export([clear_screen/0]). -export([clear_screen/0]).
-export([with_mock/4]). -export([with_mock/4]).
-export([ -export([
on_exit/1 on_exit/1,
call_janitor/0
]). ]).
%% Toxiproxy API %% Toxiproxy API
@ -933,6 +934,13 @@ latency_up_proxy(off, Name, ProxyHost, ProxyPort) ->
%% Testcase teardown utilities %% Testcase teardown utilities
%%------------------------------------------------------------------------------- %%-------------------------------------------------------------------------------
%% stop the janitor gracefully to ensure proper cleanup order and less
%% noise in the logs.
call_janitor() ->
%% Stop the janitor so its registered on-exit callbacks run now.
%% NOTE(review): the janitor traps exits, so exit(Janitor, normal) is
%% delivered as an {'EXIT', self(), normal} message; this only stops
%% the janitor because the caller is its owner/parent — confirm.
%% NOTE(review): the process-dictionary entry holding the (now dead)
%% janitor pid is not cleared here; presumably get_or_spawn_janitor/0
%% respawns when the stored pid is dead — TODO confirm.
Janitor = get_or_spawn_janitor(),
exit(Janitor, normal),
ok.
get_or_spawn_janitor() -> get_or_spawn_janitor() ->
case get({?MODULE, janitor_proc}) of case get({?MODULE, janitor_proc}) of
undefined -> undefined ->

View File

@ -47,7 +47,8 @@ end_per_testcase(_, _Config) ->
t_get_metrics(_) -> t_get_metrics(_) ->
Metrics = [a, b, c], Metrics = [a, b, c],
ok = emqx_metrics_worker:create_metrics(?NAME, <<"testid">>, Metrics), Id = <<"testid">>,
ok = emqx_metrics_worker:create_metrics(?NAME, Id, Metrics),
%% all the metrics are set to zero at start %% all the metrics are set to zero at start
?assertMatch( ?assertMatch(
#{ #{
@ -56,18 +57,22 @@ t_get_metrics(_) ->
b := #{current := 0.0, max := 0.0, last5m := 0.0}, b := #{current := 0.0, max := 0.0, last5m := 0.0},
c := #{current := 0.0, max := 0.0, last5m := 0.0} c := #{current := 0.0, max := 0.0, last5m := 0.0}
}, },
gauges := #{},
counters := #{ counters := #{
a := 0, a := 0,
b := 0, b := 0,
c := 0 c := 0
} }
}, },
emqx_metrics_worker:get_metrics(?NAME, <<"testid">>) emqx_metrics_worker:get_metrics(?NAME, Id)
), ),
ok = emqx_metrics_worker:inc(?NAME, <<"testid">>, a), ok = emqx_metrics_worker:inc(?NAME, Id, a),
ok = emqx_metrics_worker:inc(?NAME, <<"testid">>, b), ok = emqx_metrics_worker:inc(?NAME, Id, b),
ok = emqx_metrics_worker:inc(?NAME, <<"testid">>, c), ok = emqx_metrics_worker:inc(?NAME, Id, c),
ok = emqx_metrics_worker:inc(?NAME, <<"testid">>, c), ok = emqx_metrics_worker:inc(?NAME, Id, c),
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id0, inflight, 5),
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id1, inflight, 7),
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id2, queuing, 9),
ct:sleep(1500), ct:sleep(1500),
?LET( ?LET(
#{ #{
@ -76,27 +81,73 @@ t_get_metrics(_) ->
b := #{current := CurrB, max := MaxB, last5m := _}, b := #{current := CurrB, max := MaxB, last5m := _},
c := #{current := CurrC, max := MaxC, last5m := _} c := #{current := CurrC, max := MaxC, last5m := _}
}, },
gauges := #{
inflight := Inflight,
queuing := Queuing
},
counters := #{ counters := #{
a := 1, a := 1,
b := 1, b := 1,
c := 2 c := 2
} }
}, },
emqx_metrics_worker:get_metrics(?NAME, <<"testid">>), emqx_metrics_worker:get_metrics(?NAME, Id),
{ {
?assert(CurrA > 0), ?assert(CurrA > 0),
?assert(CurrB > 0), ?assert(CurrB > 0),
?assert(CurrC > 0), ?assert(CurrC > 0),
?assert(MaxA > 0), ?assert(MaxA > 0),
?assert(MaxB > 0), ?assert(MaxB > 0),
?assert(MaxC > 0) ?assert(MaxC > 0),
?assert(Inflight == 12),
?assert(Queuing == 9)
} }
), ),
ok = emqx_metrics_worker:clear_metrics(?NAME, <<"testid">>). ok = emqx_metrics_worker:clear_metrics(?NAME, Id).
%% clear_metrics/2 must wipe counters, rates AND gauges for the id.
t_clear_metrics(_Config) ->
Metrics = [a, b, c],
Id = <<"testid">>,
ok = emqx_metrics_worker:create_metrics(?NAME, Id, Metrics),
%% freshly created metrics: zeroed counters/rates, no gauges yet
?assertMatch(
#{
rate := #{
a := #{current := 0.0, max := 0.0, last5m := 0.0},
b := #{current := 0.0, max := 0.0, last5m := 0.0},
c := #{current := 0.0, max := 0.0, last5m := 0.0}
},
gauges := #{},
counters := #{
a := 0,
b := 0,
c := 0
}
},
emqx_metrics_worker:get_metrics(?NAME, Id)
),
%% populate counters and per-worker gauges
ok = emqx_metrics_worker:inc(?NAME, Id, a),
ok = emqx_metrics_worker:inc(?NAME, Id, b),
ok = emqx_metrics_worker:inc(?NAME, Id, c),
ok = emqx_metrics_worker:inc(?NAME, Id, c),
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id0, inflight, 5),
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id1, inflight, 7),
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id2, queuing, 9),
%% let at least one rate-sampling tick happen
ct:sleep(1500),
ok = emqx_metrics_worker:clear_metrics(?NAME, Id),
%% after clearing: empty counters/gauges, and the rate map collapses
%% to a single zeroed summary
?assertEqual(
#{
counters => #{},
gauges => #{},
rate => #{current => 0.0, last5m => 0.0, max => 0.0}
},
emqx_metrics_worker:get_metrics(?NAME, Id)
),
ok.
t_reset_metrics(_) -> t_reset_metrics(_) ->
Metrics = [a, b, c], Metrics = [a, b, c],
ok = emqx_metrics_worker:create_metrics(?NAME, <<"testid">>, Metrics), Id = <<"testid">>,
ok = emqx_metrics_worker:create_metrics(?NAME, Id, Metrics),
%% all the metrics are set to zero at start %% all the metrics are set to zero at start
?assertMatch( ?assertMatch(
#{ #{
@ -105,20 +156,24 @@ t_reset_metrics(_) ->
b := #{current := 0.0, max := 0.0, last5m := 0.0}, b := #{current := 0.0, max := 0.0, last5m := 0.0},
c := #{current := 0.0, max := 0.0, last5m := 0.0} c := #{current := 0.0, max := 0.0, last5m := 0.0}
}, },
gauges := #{},
counters := #{ counters := #{
a := 0, a := 0,
b := 0, b := 0,
c := 0 c := 0
} }
}, },
emqx_metrics_worker:get_metrics(?NAME, <<"testid">>) emqx_metrics_worker:get_metrics(?NAME, Id)
), ),
ok = emqx_metrics_worker:inc(?NAME, <<"testid">>, a), ok = emqx_metrics_worker:inc(?NAME, Id, a),
ok = emqx_metrics_worker:inc(?NAME, <<"testid">>, b), ok = emqx_metrics_worker:inc(?NAME, Id, b),
ok = emqx_metrics_worker:inc(?NAME, <<"testid">>, c), ok = emqx_metrics_worker:inc(?NAME, Id, c),
ok = emqx_metrics_worker:inc(?NAME, <<"testid">>, c), ok = emqx_metrics_worker:inc(?NAME, Id, c),
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id0, inflight, 5),
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id1, inflight, 7),
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id2, queuing, 9),
ct:sleep(1500), ct:sleep(1500),
ok = emqx_metrics_worker:reset_metrics(?NAME, <<"testid">>), ok = emqx_metrics_worker:reset_metrics(?NAME, Id),
?LET( ?LET(
#{ #{
rate := #{ rate := #{
@ -126,68 +181,83 @@ t_reset_metrics(_) ->
b := #{current := CurrB, max := MaxB, last5m := _}, b := #{current := CurrB, max := MaxB, last5m := _},
c := #{current := CurrC, max := MaxC, last5m := _} c := #{current := CurrC, max := MaxC, last5m := _}
}, },
gauges := Gauges,
counters := #{ counters := #{
a := 0, a := 0,
b := 0, b := 0,
c := 0 c := 0
} }
}, },
emqx_metrics_worker:get_metrics(?NAME, <<"testid">>), emqx_metrics_worker:get_metrics(?NAME, Id),
{ {
?assert(CurrA == 0), ?assert(CurrA == 0),
?assert(CurrB == 0), ?assert(CurrB == 0),
?assert(CurrC == 0), ?assert(CurrC == 0),
?assert(MaxA == 0), ?assert(MaxA == 0),
?assert(MaxB == 0), ?assert(MaxB == 0),
?assert(MaxC == 0) ?assert(MaxC == 0),
?assertEqual(#{}, Gauges)
} }
), ),
ok = emqx_metrics_worker:clear_metrics(?NAME, <<"testid">>). ok = emqx_metrics_worker:clear_metrics(?NAME, Id).
t_get_metrics_2(_) -> t_get_metrics_2(_) ->
Metrics = [a, b, c], Metrics = [a, b, c],
Id = <<"testid">>,
ok = emqx_metrics_worker:create_metrics( ok = emqx_metrics_worker:create_metrics(
?NAME, ?NAME,
<<"testid">>, Id,
Metrics, Metrics,
[a] [a]
), ),
ok = emqx_metrics_worker:inc(?NAME, <<"testid">>, a), ok = emqx_metrics_worker:inc(?NAME, Id, a),
ok = emqx_metrics_worker:inc(?NAME, <<"testid">>, b), ok = emqx_metrics_worker:inc(?NAME, Id, b),
ok = emqx_metrics_worker:inc(?NAME, <<"testid">>, c), ok = emqx_metrics_worker:inc(?NAME, Id, c),
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id0, inflight, 5),
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id1, inflight, 7),
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id2, queuing, 9),
?assertMatch( ?assertMatch(
#{ #{
rate := Rate = #{ rate := Rate = #{
a := #{current := _, max := _, last5m := _} a := #{current := _, max := _, last5m := _}
}, },
gauges := #{},
counters := #{ counters := #{
a := 1, a := 1,
b := 1, b := 1,
c := 1 c := 1
} }
} when map_size(Rate) =:= 1, } when map_size(Rate) =:= 1,
emqx_metrics_worker:get_metrics(?NAME, <<"testid">>) emqx_metrics_worker:get_metrics(?NAME, Id)
), ),
ok = emqx_metrics_worker:clear_metrics(?NAME, <<"testid">>). ok = emqx_metrics_worker:clear_metrics(?NAME, Id).
t_recreate_metrics(_) -> t_recreate_metrics(_) ->
ok = emqx_metrics_worker:create_metrics(?NAME, <<"testid">>, [a]), Id = <<"testid">>,
ok = emqx_metrics_worker:inc(?NAME, <<"testid">>, a), ok = emqx_metrics_worker:create_metrics(?NAME, Id, [a]),
ok = emqx_metrics_worker:inc(?NAME, Id, a),
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id0, inflight, 5),
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id1, inflight, 7),
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id2, queuing, 9),
?assertMatch( ?assertMatch(
#{ #{
rate := R = #{ rate := R = #{
a := #{current := _, max := _, last5m := _} a := #{current := _, max := _, last5m := _}
}, },
gauges := #{
inflight := 12,
queuing := 9
},
counters := C = #{ counters := C = #{
a := 1 a := 1
} }
} when map_size(R) == 1 andalso map_size(C) == 1, } when map_size(R) == 1 andalso map_size(C) == 1,
emqx_metrics_worker:get_metrics(?NAME, <<"testid">>) emqx_metrics_worker:get_metrics(?NAME, Id)
), ),
%% we create the metrics again, to add some counters %% we create the metrics again, to add some counters
ok = emqx_metrics_worker:create_metrics(?NAME, <<"testid">>, [a, b, c]), ok = emqx_metrics_worker:create_metrics(?NAME, Id, [a, b, c]),
ok = emqx_metrics_worker:inc(?NAME, <<"testid">>, b), ok = emqx_metrics_worker:inc(?NAME, Id, b),
ok = emqx_metrics_worker:inc(?NAME, <<"testid">>, c), ok = emqx_metrics_worker:inc(?NAME, Id, c),
?assertMatch( ?assertMatch(
#{ #{
rate := R = #{ rate := R = #{
@ -195,13 +265,17 @@ t_recreate_metrics(_) ->
b := #{current := _, max := _, last5m := _}, b := #{current := _, max := _, last5m := _},
c := #{current := _, max := _, last5m := _} c := #{current := _, max := _, last5m := _}
}, },
gauges := #{
inflight := 12,
queuing := 9
},
counters := C = #{ counters := C = #{
a := 1, b := 1, c := 1 a := 1, b := 1, c := 1
} }
} when map_size(R) == 3 andalso map_size(C) == 3, } when map_size(R) == 3 andalso map_size(C) == 3,
emqx_metrics_worker:get_metrics(?NAME, <<"testid">>) emqx_metrics_worker:get_metrics(?NAME, Id)
), ),
ok = emqx_metrics_worker:clear_metrics(?NAME, <<"testid">>). ok = emqx_metrics_worker:clear_metrics(?NAME, Id).
t_inc_matched(_) -> t_inc_matched(_) ->
Metrics = ['rules.matched'], Metrics = ['rules.matched'],
@ -238,3 +312,102 @@ t_rate(_) ->
ct:sleep(3000), ct:sleep(3000),
ok = emqx_metrics_worker:clear_metrics(?NAME, <<"rule1">>), ok = emqx_metrics_worker:clear_metrics(?NAME, <<"rule1">>),
ok = emqx_metrics_worker:clear_metrics(?NAME, <<"rule:2">>). ok = emqx_metrics_worker:clear_metrics(?NAME, <<"rule:2">>).
%% get_gauge/3 sums per-worker values and is 0 for anything unknown.
t_get_gauge(_Config) ->
%% note: this atom is spelled 'queueing' (unlike the 'queuing' metric
%% used in other testcases); it is purely local to this test
Metric = 'queueing',
%% unknown handler name (inexistent table)
?assertEqual(0, emqx_metrics_worker:get_gauge(unknown_name, unknown_id, Metric)),
%% unknown resource id
?assertEqual(0, emqx_metrics_worker:get_gauge(?NAME, unknown_id, Metric)),
Id = <<"some id">>,
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id0, Metric, 2),
?assertEqual(2, emqx_metrics_worker:get_gauge(?NAME, Id, Metric)),
?assertEqual(0, emqx_metrics_worker:get_gauge(?NAME, unknown, Metric)),
%% values from distinct workers are summed (2 + 3)
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id1, Metric, 3),
?assertEqual(5, emqx_metrics_worker:get_gauge(?NAME, Id, Metric)),
%% set_gauge overwrites a worker's previous value (2 -> 1)
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id0, Metric, 1),
?assertEqual(4, emqx_metrics_worker:get_gauge(?NAME, Id, Metric)),
?assertEqual(0, emqx_metrics_worker:get_gauge(?NAME, Id, another_metric)),
ok.
%% get_gauges/2 returns all gauges of an id, summed over workers.
t_get_gauges(_Config) ->
%% unknown handler name (inexistent table)
?assertEqual(#{}, emqx_metrics_worker:get_gauges(unknown_name, unknown_id)),
%% unknown resource id
?assertEqual(#{}, emqx_metrics_worker:get_gauges(?NAME, unknown_id)),
Metric = 'queuing',
Id = <<"some id">>,
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id0, Metric, 2),
?assertEqual(#{queuing => 2}, emqx_metrics_worker:get_gauges(?NAME, Id)),
?assertEqual(#{}, emqx_metrics_worker:get_gauges(?NAME, unknown)),
%% values from distinct workers are summed (2 + 3)
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id1, Metric, 3),
?assertEqual(#{queuing => 5}, emqx_metrics_worker:get_gauges(?NAME, Id)),
AnotherMetric = 'inflight',
%% worker_id0's 'queuing' is overwritten (2 -> 1); a second metric
%% shows up as its own key
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id0, Metric, 1),
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id0, AnotherMetric, 10),
?assertEqual(#{queuing => 4, inflight => 10}, emqx_metrics_worker:get_gauges(?NAME, Id)),
ok.
%% delete_gauges/2 removes only the given id's gauges and is a no-op
%% for unknown handlers/ids.
t_delete_gauge(_Config) ->
%% unknown handler name (inexistent table)
?assertEqual(ok, emqx_metrics_worker:delete_gauges(unknown_name, unknown_id)),
%% unknown resource id
?assertEqual(ok, emqx_metrics_worker:delete_gauges(?NAME, unknown_id)),
Metric = 'queuing',
AnotherMetric = 'inflight',
Id = <<"some id">>,
AnotherId = <<"another id">>,
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id0, Metric, 2),
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id1, Metric, 3),
ok = emqx_metrics_worker:set_gauge(?NAME, Id, worker_id0, AnotherMetric, 10),
ok = emqx_metrics_worker:set_gauge(?NAME, AnotherId, worker_id1, AnotherMetric, 10),
?assertEqual(#{queuing => 5, inflight => 10}, emqx_metrics_worker:get_gauges(?NAME, Id)),
%% deleting Id's gauges must not touch AnotherId's
?assertEqual(ok, emqx_metrics_worker:delete_gauges(?NAME, Id)),
?assertEqual(#{}, emqx_metrics_worker:get_gauges(?NAME, Id)),
?assertEqual(#{inflight => 10}, emqx_metrics_worker:get_gauges(?NAME, AnotherId)),
ok.
%% shift_gauge/5 applies per-worker deltas (creating rows at 0) and is
%% a no-op for unknown handlers.
t_shift_gauge(_Config) ->
%% note: this atom is spelled 'queueing' (unlike the 'queuing' metric
%% used in other testcases); it is purely local to this test
Metric = 'queueing',
Id = <<"some id">>,
AnotherId = <<"another id">>,
%% unknown handler name (inexistent table)
?assertEqual(
ok, emqx_metrics_worker:shift_gauge(unknown_name, unknown_id, worker_id0, Metric, 2)
),
?assertEqual(0, emqx_metrics_worker:get_gauge(unknown_name, unknown_id, Metric)),
%% empty resource id: first shift seeds the row at 0, then adds
?assertEqual(ok, emqx_metrics_worker:shift_gauge(?NAME, Id, worker_id0, Metric, 2)),
?assertEqual(ok, emqx_metrics_worker:shift_gauge(?NAME, AnotherId, worker_id0, Metric, 2)),
?assertEqual(2, emqx_metrics_worker:get_gauge(?NAME, Id, Metric)),
?assertEqual(2, emqx_metrics_worker:get_gauge(?NAME, AnotherId, Metric)),
%% shifts accumulate per worker and sum across workers
?assertEqual(ok, emqx_metrics_worker:shift_gauge(?NAME, Id, worker_id0, Metric, 3)),
?assertEqual(5, emqx_metrics_worker:get_gauge(?NAME, Id, Metric)),
?assertEqual(ok, emqx_metrics_worker:shift_gauge(?NAME, Id, worker_id1, Metric, 10)),
?assertEqual(15, emqx_metrics_worker:get_gauge(?NAME, Id, Metric)),
%% negative deltas are allowed
?assertEqual(ok, emqx_metrics_worker:shift_gauge(?NAME, Id, worker_id1, Metric, -4)),
?assertEqual(11, emqx_metrics_worker:get_gauge(?NAME, Id, Metric)),
%% the other id is untouched
?assertEqual(2, emqx_metrics_worker:get_gauge(?NAME, AnotherId, Metric)),
ok.

View File

@ -49,8 +49,7 @@ push_on_exit_callback(Server, Callback) when is_function(Callback, 0) ->
init(Parent) -> init(Parent) ->
process_flag(trap_exit, true), process_flag(trap_exit, true),
Ref = monitor(process, Parent), {ok, #{callbacks => [], owner => Parent}}.
{ok, #{callbacks => [], owner => {Ref, Parent}}}.
terminate(_Reason, #{callbacks := Callbacks}) -> terminate(_Reason, #{callbacks := Callbacks}) ->
lists:foreach(fun(Fun) -> Fun() end, Callbacks). lists:foreach(fun(Fun) -> Fun() end, Callbacks).
@ -63,7 +62,7 @@ handle_call(_Req, _From, State) ->
handle_cast(_Req, State) -> handle_cast(_Req, State) ->
{noreply, State}. {noreply, State}.
handle_info({'DOWN', Ref, process, Parent, _Reason}, State = #{owner := {Ref, Parent}}) -> handle_info({'EXIT', Parent, _Reason}, State = #{owner := Parent}) ->
{stop, normal, State}; {stop, normal, State};
handle_info(_Msg, State) -> handle_info(_Msg, State) ->
{noreply, State}. {noreply, State}.

View File

@ -195,7 +195,7 @@ emqx_bridge_schema {
metric_sent_inflight { metric_sent_inflight {
desc { desc {
en: """Count of messages that were sent asynchronously but ACKs are not received.""" en: """Count of messages that were sent asynchronously but ACKs are not yet received."""
zh: """已异步地发送但没有收到 ACK 的消息个数。""" zh: """已异步地发送但没有收到 ACK 的消息个数。"""
} }
label: { label: {

View File

@ -1,7 +1,7 @@
%% -*- mode: erlang -*- %% -*- mode: erlang -*-
{application, emqx_bridge, [ {application, emqx_bridge, [
{description, "EMQX bridges"}, {description, "EMQX bridges"},
{vsn, "0.1.7"}, {vsn, "0.1.8"},
{registered, []}, {registered, []},
{mod, {emqx_bridge_app, []}}, {mod, {emqx_bridge_app, []}},
{applications, [ {applications, [

View File

@ -686,7 +686,6 @@ format_resp(
format_metrics(#{ format_metrics(#{
counters := #{ counters := #{
'batching' := Batched,
'dropped' := Dropped, 'dropped' := Dropped,
'dropped.other' := DroppedOther, 'dropped.other' := DroppedOther,
'dropped.queue_full' := DroppedQueueFull, 'dropped.queue_full' := DroppedQueueFull,
@ -694,17 +693,19 @@ format_metrics(#{
'dropped.resource_not_found' := DroppedResourceNotFound, 'dropped.resource_not_found' := DroppedResourceNotFound,
'dropped.resource_stopped' := DroppedResourceStopped, 'dropped.resource_stopped' := DroppedResourceStopped,
'matched' := Matched, 'matched' := Matched,
'queuing' := Queued,
'retried' := Retried, 'retried' := Retried,
'failed' := SentFailed, 'failed' := SentFailed,
'inflight' := SentInflight,
'success' := SentSucc, 'success' := SentSucc,
'received' := Rcvd 'received' := Rcvd
}, },
gauges := Gauges,
rate := #{ rate := #{
matched := #{current := Rate, last5m := Rate5m, max := RateMax} matched := #{current := Rate, last5m := Rate5m, max := RateMax}
} }
}) -> }) ->
Batched = maps:get('batching', Gauges, 0),
Queued = maps:get('queuing', Gauges, 0),
SentInflight = maps:get('inflight', Gauges, 0),
?METRICS( ?METRICS(
Batched, Batched,
Dropped, Dropped,

View File

@ -19,6 +19,7 @@
-compile(export_all). -compile(export_all).
-import(emqx_dashboard_api_test_helpers, [request/4, uri/1]). -import(emqx_dashboard_api_test_helpers, [request/4, uri/1]).
-import(emqx_common_test_helpers, [on_exit/1]).
-include("emqx/include/emqx.hrl"). -include("emqx/include/emqx.hrl").
-include_lib("eunit/include/eunit.hrl"). -include_lib("eunit/include/eunit.hrl").
@ -124,6 +125,7 @@ init_per_testcase(_, Config) ->
Config. Config.
end_per_testcase(_, _Config) -> end_per_testcase(_, _Config) ->
clear_resources(), clear_resources(),
emqx_common_test_helpers:call_janitor(),
ok. ok.
clear_resources() -> clear_resources() ->
@ -672,6 +674,12 @@ t_mqtt_conn_bridge_egress_reconnect(_) ->
<<"name">> := ?BRIDGE_NAME_EGRESS <<"name">> := ?BRIDGE_NAME_EGRESS
} = jsx:decode(Bridge), } = jsx:decode(Bridge),
BridgeIDEgress = emqx_bridge_resource:bridge_id(?TYPE_MQTT, ?BRIDGE_NAME_EGRESS), BridgeIDEgress = emqx_bridge_resource:bridge_id(?TYPE_MQTT, ?BRIDGE_NAME_EGRESS),
on_exit(fun() ->
%% delete the bridge
{ok, 204, <<>>} = request(delete, uri(["bridges", BridgeIDEgress]), []),
{ok, 200, <<"[]">>} = request(get, uri(["bridges"]), []),
ok
end),
%% we now test if the bridge works as expected %% we now test if the bridge works as expected
LocalTopic = <<?EGRESS_LOCAL_TOPIC, "/1">>, LocalTopic = <<?EGRESS_LOCAL_TOPIC, "/1">>,
RemoteTopic = <<?EGRESS_REMOTE_TOPIC, "/", LocalTopic/binary>>, RemoteTopic = <<?EGRESS_REMOTE_TOPIC, "/", LocalTopic/binary>>,
@ -733,15 +741,20 @@ t_mqtt_conn_bridge_egress_reconnect(_) ->
%% verify the metrics of the bridge, the message should be queued %% verify the metrics of the bridge, the message should be queued
{ok, 200, BridgeStr1} = request(get, uri(["bridges", BridgeIDEgress]), []), {ok, 200, BridgeStr1} = request(get, uri(["bridges", BridgeIDEgress]), []),
Decoded1 = jsx:decode(BridgeStr1),
?assertMatch(
Status when (Status == <<"connected">> orelse Status == <<"connecting">>),
maps:get(<<"status">>, Decoded1)
),
%% matched >= 3 because of possible retries. %% matched >= 3 because of possible retries.
?assertMatch( ?assertMatch(
#{ #{
<<"status">> := Status, <<"matched">> := Matched,
<<"metrics">> := #{ <<"success">> := 1,
<<"matched">> := Matched, <<"success">> := 1, <<"failed">> := 0, <<"queuing">> := 2 <<"failed">> := 0,
} <<"queuing">> := 2
} when Matched >= 3 andalso (Status == <<"connected">> orelse Status == <<"connecting">>), } when Matched >= 3,
jsx:decode(BridgeStr1) maps:get(<<"metrics">>, Decoded1)
), ),
%% start the listener 1883 to make the bridge reconnected %% start the listener 1883 to make the bridge reconnected
@ -766,9 +779,6 @@ t_mqtt_conn_bridge_egress_reconnect(_) ->
%% also verify the 2 messages have been sent to the remote broker %% also verify the 2 messages have been sent to the remote broker
assert_mqtt_msg_received(RemoteTopic, Payload1), assert_mqtt_msg_received(RemoteTopic, Payload1),
assert_mqtt_msg_received(RemoteTopic, Payload2), assert_mqtt_msg_received(RemoteTopic, Payload2),
%% delete the bridge
{ok, 204, <<>>} = request(delete, uri(["bridges", BridgeIDEgress]), []),
{ok, 200, <<"[]">>} = request(get, uri(["bridges"]), []),
ok. ok.
assert_mqtt_msg_received(Topic, Payload) -> assert_mqtt_msg_received(Topic, Payload) ->

View File

@ -1,7 +1,7 @@
%% -*- mode: erlang -*- %% -*- mode: erlang -*-
{application, emqx_resource, [ {application, emqx_resource, [
{description, "Manager for all external resources"}, {description, "Manager for all external resources"},
{vsn, "0.1.3"}, {vsn, "0.1.4"},
{registered, []}, {registered, []},
{mod, {emqx_resource_app, []}}, {mod, {emqx_resource_app, []}},
{applications, [ {applications, [

View File

@ -141,9 +141,6 @@ create(MgrId, ResId, Group, ResourceType, Config, Opts) ->
'dropped.resource_not_found', 'dropped.resource_not_found',
'dropped.resource_stopped', 'dropped.resource_stopped',
'dropped.other', 'dropped.other',
'queuing',
'batching',
'inflight',
'received' 'received'
], ],
[matched] [matched]

View File

@ -24,11 +24,12 @@
]). ]).
-export([ -export([
batching_change/2, batching_set/3,
batching_shift/3,
batching_get/1, batching_get/1,
inflight_change/2, inflight_set/3,
inflight_get/1, inflight_get/1,
queuing_change/2, queuing_set/3,
queuing_get/1, queuing_get/1,
dropped_inc/1, dropped_inc/1,
dropped_inc/2, dropped_inc/2,
@ -114,8 +115,6 @@ handle_telemetry_event(
_HandlerConfig _HandlerConfig
) -> ) ->
case Event of case Event of
batching ->
emqx_metrics_worker:inc(?RES_METRICS, ID, 'batching', Val);
dropped_other -> dropped_other ->
emqx_metrics_worker:inc(?RES_METRICS, ID, 'dropped', Val), emqx_metrics_worker:inc(?RES_METRICS, ID, 'dropped', Val),
emqx_metrics_worker:inc(?RES_METRICS, ID, 'dropped.other', Val); emqx_metrics_worker:inc(?RES_METRICS, ID, 'dropped.other', Val);
@ -133,12 +132,8 @@ handle_telemetry_event(
emqx_metrics_worker:inc(?RES_METRICS, ID, 'dropped.resource_stopped', Val); emqx_metrics_worker:inc(?RES_METRICS, ID, 'dropped.resource_stopped', Val);
failed -> failed ->
emqx_metrics_worker:inc(?RES_METRICS, ID, 'failed', Val); emqx_metrics_worker:inc(?RES_METRICS, ID, 'failed', Val);
inflight ->
emqx_metrics_worker:inc(?RES_METRICS, ID, 'inflight', Val);
matched -> matched ->
emqx_metrics_worker:inc(?RES_METRICS, ID, 'matched', Val); emqx_metrics_worker:inc(?RES_METRICS, ID, 'matched', Val);
queuing ->
emqx_metrics_worker:inc(?RES_METRICS, ID, 'queuing', Val);
retried_failed -> retried_failed ->
emqx_metrics_worker:inc(?RES_METRICS, ID, 'retried', Val), emqx_metrics_worker:inc(?RES_METRICS, ID, 'retried', Val),
emqx_metrics_worker:inc(?RES_METRICS, ID, 'failed', Val), emqx_metrics_worker:inc(?RES_METRICS, ID, 'failed', Val),
@ -152,6 +147,34 @@ handle_telemetry_event(
_ -> _ ->
ok ok
end; end;
%% Absolute gauge updates (gauge_set measurement): the worker reports
%% its current value for the gauge, overwriting its previous reading.
handle_telemetry_event(
[?TELEMETRY_PREFIX, Event],
_Measurements = #{gauge_set := Val},
_Metadata = #{resource_id := ID, worker_id := WorkerID},
_HandlerConfig
) ->
case Event of
batching ->
emqx_metrics_worker:set_gauge(?RES_METRICS, ID, WorkerID, 'batching', Val);
inflight ->
emqx_metrics_worker:set_gauge(?RES_METRICS, ID, WorkerID, 'inflight', Val);
queuing ->
emqx_metrics_worker:set_gauge(?RES_METRICS, ID, WorkerID, 'queuing', Val);
_ ->
%% unrecognized gauge event: ignore
ok
end;
%% Relative gauge updates (gauge_shift measurement): `Val' is a delta
%% added to the worker's current value; only 'batching' uses this.
handle_telemetry_event(
[?TELEMETRY_PREFIX, Event],
_Measurements = #{gauge_shift := Val},
_Metadata = #{resource_id := ID, worker_id := WorkerID},
_HandlerConfig
) ->
case Event of
batching ->
emqx_metrics_worker:shift_gauge(?RES_METRICS, ID, WorkerID, 'batching', Val);
_ ->
%% unrecognized shift event: ignore
ok
end;
handle_telemetry_event(_EventName, _Measurements, _Metadata, _HandlerConfig) -> handle_telemetry_event(_EventName, _Measurements, _Metadata, _HandlerConfig) ->
ok. ok.
@ -160,26 +183,48 @@ handle_telemetry_event(_EventName, _Measurements, _Metadata, _HandlerConfig) ->
%% @doc Count of messages that are currently accumulated in memory waiting for %% @doc Count of messages that are currently accumulated in memory waiting for
%% being sent in one batch %% being sent in one batch
batching_change(ID, Val) -> batching_set(ID, WorkerID, Val) ->
telemetry:execute([?TELEMETRY_PREFIX, batching], #{counter_inc => Val}, #{resource_id => ID}). telemetry:execute(
[?TELEMETRY_PREFIX, batching],
#{gauge_set => Val},
#{resource_id => ID, worker_id => WorkerID}
).
batching_shift(_ID, _WorkerID = undefined, _Val) ->
ok;
batching_shift(ID, WorkerID, Val) ->
telemetry:execute(
[?TELEMETRY_PREFIX, batching],
#{gauge_shift => Val},
#{resource_id => ID, worker_id => WorkerID}
).
batching_get(ID) -> batching_get(ID) ->
emqx_metrics_worker:get(?RES_METRICS, ID, 'batching'). emqx_metrics_worker:get_gauge(?RES_METRICS, ID, 'batching').
%% @doc Count of messages that are currently queuing. [Gauge] %% @doc Count of batches of messages that are currently
queuing_change(ID, Val) -> %% queuing. [Gauge]
telemetry:execute([?TELEMETRY_PREFIX, queuing], #{counter_inc => Val}, #{resource_id => ID}). queuing_set(ID, WorkerID, Val) ->
telemetry:execute(
[?TELEMETRY_PREFIX, queuing],
#{gauge_set => Val},
#{resource_id => ID, worker_id => WorkerID}
).
queuing_get(ID) -> queuing_get(ID) ->
emqx_metrics_worker:get(?RES_METRICS, ID, 'queuing'). emqx_metrics_worker:get_gauge(?RES_METRICS, ID, 'queuing').
%% @doc Count of messages that were sent asynchronously but ACKs are not %% @doc Count of batches of messages that were sent asynchronously but
%% received. [Gauge] %% ACKs are not yet received. [Gauge]
inflight_change(ID, Val) -> inflight_set(ID, WorkerID, Val) ->
telemetry:execute([?TELEMETRY_PREFIX, inflight], #{counter_inc => Val}, #{resource_id => ID}). telemetry:execute(
[?TELEMETRY_PREFIX, inflight],
#{gauge_set => Val},
#{resource_id => ID, worker_id => WorkerID}
).
inflight_get(ID) -> inflight_get(ID) ->
emqx_metrics_worker:get(?RES_METRICS, ID, 'inflight'). emqx_metrics_worker:get_gauge(?RES_METRICS, ID, 'inflight').
%% Counters (value can only got up): %% Counters (value can only got up):
%% -------------------------------------- %% --------------------------------------

View File

@ -52,7 +52,7 @@
-export([queue_item_marshaller/1, estimate_size/1]). -export([queue_item_marshaller/1, estimate_size/1]).
-export([reply_after_query/6, batch_reply_after_query/6]). -export([reply_after_query/7, batch_reply_after_query/7]).
-define(Q_ITEM(REQUEST), {q_item, REQUEST}). -define(Q_ITEM(REQUEST), {q_item, REQUEST}).
@ -90,13 +90,27 @@ async_query(Id, Request, Opts) ->
%% simple query the resource without batching and queuing messages. %% simple query the resource without batching and queuing messages.
-spec simple_sync_query(id(), request()) -> Result :: term(). -spec simple_sync_query(id(), request()) -> Result :: term().
simple_sync_query(Id, Request) -> simple_sync_query(Id, Request) ->
Result = call_query(sync, Id, ?QUERY(self(), Request, false), #{}), %% Note: since calling this function implies in bypassing the
%% buffer workers, and each buffer worker index is used when
%% collecting gauge metrics, we use this dummy index. If this
%% call ends up calling buffering functions, that's a bug and
%% would mess up the metrics anyway. `undefined' is ignored by
%% `emqx_resource_metrics:*_shift/3'.
Index = undefined,
Result = call_query(sync, Id, Index, ?QUERY(self(), Request, false), #{}),
_ = handle_query_result(Id, Result, false, false), _ = handle_query_result(Id, Result, false, false),
Result. Result.
-spec simple_async_query(id(), request(), reply_fun()) -> Result :: term(). -spec simple_async_query(id(), request(), reply_fun()) -> Result :: term().
simple_async_query(Id, Request, ReplyFun) -> simple_async_query(Id, Request, ReplyFun) ->
Result = call_query(async, Id, ?QUERY(ReplyFun, Request, false), #{}), %% Note: since calling this function implies in bypassing the
%% buffer workers, and each buffer worker index is used when
%% collecting gauge metrics, we use this dummy index. If this
%% call ends up calling buffering functions, that's a bug and
%% would mess up the metrics anyway. `undefined' is ignored by
%% `emqx_resource_metrics:*_shift/3'.
Index = undefined,
Result = call_query(async, Id, Index, ?QUERY(ReplyFun, Request, false), #{}),
_ = handle_query_result(Id, Result, false, false), _ = handle_query_result(Id, Result, false, false),
Result. Result.
@ -133,9 +147,11 @@ init({Id, Index, Opts}) ->
false -> false ->
undefined undefined
end, end,
emqx_resource_metrics:queuing_change(Id, queue_count(Queue)), emqx_resource_metrics:queuing_set(Id, Index, queue_count(Queue)),
emqx_resource_metrics:batching_set(Id, Index, 0),
emqx_resource_metrics:inflight_set(Id, Index, 0),
InfltWinSZ = maps:get(async_inflight_window, Opts, ?DEFAULT_INFLIGHT), InfltWinSZ = maps:get(async_inflight_window, Opts, ?DEFAULT_INFLIGHT),
ok = inflight_new(Name, InfltWinSZ), ok = inflight_new(Name, InfltWinSZ, Id, Index),
HCItvl = maps:get(health_check_interval, Opts, ?HEALTHCHECK_INTERVAL), HCItvl = maps:get(health_check_interval, Opts, ?HEALTHCHECK_INTERVAL),
St = #{ St = #{
id => Id, id => Id,
@ -158,10 +174,12 @@ running(cast, resume, _St) ->
keep_state_and_data; keep_state_and_data;
running(cast, block, St) -> running(cast, block, St) ->
{next_state, blocked, St}; {next_state, blocked, St};
running(cast, {block, [?QUERY(_, _, _) | _] = Batch}, #{id := Id, queue := Q} = St) when running(
cast, {block, [?QUERY(_, _, _) | _] = Batch}, #{id := Id, index := Index, queue := Q} = St
) when
is_list(Batch) is_list(Batch)
-> ->
Q1 = maybe_append_queue(Id, Q, [?Q_ITEM(Query) || Query <- Batch]), Q1 = maybe_append_queue(Id, Index, Q, [?Q_ITEM(Query) || Query <- Batch]),
{next_state, blocked, St#{queue := Q1}}; {next_state, blocked, St#{queue := Q1}};
running({call, From}, {query, Request, _Opts}, St) -> running({call, From}, {query, Request, _Opts}, St) ->
query_or_acc(From, Request, St); query_or_acc(From, Request, St);
@ -180,28 +198,39 @@ blocked(enter, _, #{resume_interval := ResumeT} = _St) ->
{keep_state_and_data, {state_timeout, ResumeT, resume}}; {keep_state_and_data, {state_timeout, ResumeT, resume}};
blocked(cast, block, _St) -> blocked(cast, block, _St) ->
keep_state_and_data; keep_state_and_data;
blocked(cast, {block, [?QUERY(_, _, _) | _] = Batch}, #{id := Id, queue := Q} = St) when blocked(
cast, {block, [?QUERY(_, _, _) | _] = Batch}, #{id := Id, index := Index, queue := Q} = St
) when
is_list(Batch) is_list(Batch)
-> ->
Q1 = maybe_append_queue(Id, Q, [?Q_ITEM(Query) || Query <- Batch]), Q1 = maybe_append_queue(Id, Index, Q, [?Q_ITEM(Query) || Query <- Batch]),
{keep_state, St#{queue := Q1}}; {keep_state, St#{queue := Q1}};
blocked(cast, resume, St) -> blocked(cast, resume, St) ->
do_resume(St); do_resume(St);
blocked(state_timeout, resume, St) -> blocked(state_timeout, resume, St) ->
do_resume(St); do_resume(St);
blocked({call, From}, {query, Request, _Opts}, #{id := Id, queue := Q} = St) -> blocked({call, From}, {query, Request, _Opts}, #{id := Id, index := Index, queue := Q} = St) ->
Error = ?RESOURCE_ERROR(blocked, "resource is blocked"), Error = ?RESOURCE_ERROR(blocked, "resource is blocked"),
_ = reply_caller(Id, ?REPLY(From, Request, false, Error)), _ = reply_caller(Id, ?REPLY(From, Request, false, Error)),
{keep_state, St#{queue := maybe_append_queue(Id, Q, [?Q_ITEM(?QUERY(From, Request, false))])}}; {keep_state, St#{
blocked(cast, {query, Request, Opts}, #{id := Id, queue := Q} = St) -> queue := maybe_append_queue(Id, Index, Q, [?Q_ITEM(?QUERY(From, Request, false))])
}};
blocked(cast, {query, Request, Opts}, #{id := Id, index := Index, queue := Q} = St) ->
ReplyFun = maps:get(async_reply_fun, Opts, undefined), ReplyFun = maps:get(async_reply_fun, Opts, undefined),
Error = ?RESOURCE_ERROR(blocked, "resource is blocked"), Error = ?RESOURCE_ERROR(blocked, "resource is blocked"),
_ = reply_caller(Id, ?REPLY(ReplyFun, Request, false, Error)), _ = reply_caller(Id, ?REPLY(ReplyFun, Request, false, Error)),
{keep_state, St#{ {keep_state, St#{
queue := maybe_append_queue(Id, Q, [?Q_ITEM(?QUERY(ReplyFun, Request, false))]) queue := maybe_append_queue(Id, Index, Q, [?Q_ITEM(?QUERY(ReplyFun, Request, false))])
}}. }}.
terminate(_Reason, #{id := Id, index := Index}) -> terminate(_Reason, #{id := Id, index := Index, queue := Q}) ->
GaugeFns =
[
fun emqx_resource_metrics:batching_set/3,
fun emqx_resource_metrics:inflight_set/3
],
lists:foreach(fun(Fn) -> Fn(Id, Index, 0) end, GaugeFns),
emqx_resource_metrics:queuing_set(Id, Index, queue_count(Q)),
gproc_pool:disconnect_worker(Id, {Id, Index}). gproc_pool:disconnect_worker(Id, {Id, Index}).
code_change(_OldVsn, State, _Extra) -> code_change(_OldVsn, State, _Extra) ->
@ -240,24 +269,33 @@ do_resume(#{id := Id, name := Name} = St) ->
retry_queue(#{queue := undefined} = St) -> retry_queue(#{queue := undefined} = St) ->
{next_state, running, St}; {next_state, running, St};
retry_queue(#{queue := Q, id := Id, enable_batch := false, resume_interval := ResumeT} = St) -> retry_queue(
#{
queue := Q,
id := Id,
index := Index,
enable_batch := false,
resume_interval := ResumeT
} = St
) ->
case get_first_n_from_queue(Q, 1) of case get_first_n_from_queue(Q, 1) of
[] -> [] ->
{next_state, running, St}; {next_state, running, St};
[?QUERY(_, Request, HasSent) = Query] -> [?QUERY(_, Request, HasSent) = Query] ->
QueryOpts = #{inflight_name => maps:get(name, St)}, QueryOpts = #{inflight_name => maps:get(name, St)},
Result = call_query(configured, Id, Query, QueryOpts), Result = call_query(configured, Id, Index, Query, QueryOpts),
case reply_caller(Id, ?REPLY(undefined, Request, HasSent, Result)) of case reply_caller(Id, ?REPLY(undefined, Request, HasSent, Result)) of
true -> true ->
{keep_state, St, {state_timeout, ResumeT, resume}}; {keep_state, St, {state_timeout, ResumeT, resume}};
false -> false ->
retry_queue(St#{queue := drop_head(Q, Id)}) retry_queue(St#{queue := drop_head(Q, Id, Index)})
end end
end; end;
retry_queue( retry_queue(
#{ #{
queue := Q, queue := Q,
id := Id, id := Id,
index := Index,
enable_batch := true, enable_batch := true,
batch_size := BatchSize, batch_size := BatchSize,
resume_interval := ResumeT resume_interval := ResumeT
@ -268,7 +306,7 @@ retry_queue(
{next_state, running, St}; {next_state, running, St};
Batch0 -> Batch0 ->
QueryOpts = #{inflight_name => maps:get(name, St)}, QueryOpts = #{inflight_name => maps:get(name, St)},
Result = call_query(configured, Id, Batch0, QueryOpts), Result = call_query(configured, Id, Index, Batch0, QueryOpts),
%% The caller has been replied with ?RESOURCE_ERROR(blocked, _) before saving into the queue, %% The caller has been replied with ?RESOURCE_ERROR(blocked, _) before saving into the queue,
%% we now change the 'from' field to 'undefined' so it will not reply the caller again. %% we now change the 'from' field to 'undefined' so it will not reply the caller again.
Batch = [?QUERY(undefined, Request, HasSent) || ?QUERY(_, Request, HasSent) <- Batch0], Batch = [?QUERY(undefined, Request, HasSent) || ?QUERY(_, Request, HasSent) <- Batch0],
@ -276,41 +314,55 @@ retry_queue(
true -> true ->
{keep_state, St, {state_timeout, ResumeT, resume}}; {keep_state, St, {state_timeout, ResumeT, resume}};
false -> false ->
retry_queue(St#{queue := drop_first_n_from_queue(Q, length(Batch), Id)}) retry_queue(St#{queue := drop_first_n_from_queue(Q, length(Batch), Id, Index)})
end end
end. end.
retry_inflight_sync( retry_inflight_sync(
Id, Ref, ?QUERY(_, _, HasSent) = Query, Name, #{resume_interval := ResumeT} = St0 Id,
Ref,
?QUERY(_, _, HasSent) = Query,
Name,
#{index := Index, resume_interval := ResumeT} = St0
) -> ) ->
Result = call_query(sync, Id, Query, #{}), Result = call_query(sync, Id, Index, Query, #{}),
case handle_query_result(Id, Result, HasSent, false) of case handle_query_result(Id, Result, HasSent, false) of
%% Send failed because resource down %% Send failed because resource down
true -> true ->
{keep_state, St0, {state_timeout, ResumeT, resume}}; {keep_state, St0, {state_timeout, ResumeT, resume}};
%% Send ok or failed but the resource is working %% Send ok or failed but the resource is working
false -> false ->
inflight_drop(Name, Ref), inflight_drop(Name, Ref, Id, Index),
do_resume(St0) do_resume(St0)
end. end.
query_or_acc(From, Request, #{enable_batch := true, acc := Acc, acc_left := Left, id := Id} = St0) -> query_or_acc(
From,
Request,
#{
enable_batch := true,
acc := Acc,
acc_left := Left,
index := Index,
id := Id
} = St0
) ->
Acc1 = [?QUERY(From, Request, false) | Acc], Acc1 = [?QUERY(From, Request, false) | Acc],
emqx_resource_metrics:batching_change(Id, 1), emqx_resource_metrics:batching_shift(Id, Index, 1),
St = St0#{acc := Acc1, acc_left := Left - 1}, St = St0#{acc := Acc1, acc_left := Left - 1},
case Left =< 1 of case Left =< 1 of
true -> flush(St); true -> flush(St);
false -> {keep_state, ensure_flush_timer(St)} false -> {keep_state, ensure_flush_timer(St)}
end; end;
query_or_acc(From, Request, #{enable_batch := false, queue := Q, id := Id} = St) -> query_or_acc(From, Request, #{enable_batch := false, queue := Q, id := Id, index := Index} = St) ->
QueryOpts = #{ QueryOpts = #{
inflight_name => maps:get(name, St) inflight_name => maps:get(name, St)
}, },
Result = call_query(configured, Id, ?QUERY(From, Request, false), QueryOpts), Result = call_query(configured, Id, Index, ?QUERY(From, Request, false), QueryOpts),
case reply_caller(Id, ?REPLY(From, Request, false, Result)) of case reply_caller(Id, ?REPLY(From, Request, false, Result)) of
true -> true ->
Query = ?QUERY(From, Request, false), Query = ?QUERY(From, Request, false),
{next_state, blocked, St#{queue := maybe_append_queue(Id, Q, [?Q_ITEM(Query)])}}; {next_state, blocked, St#{queue := maybe_append_queue(Id, Index, Q, [?Q_ITEM(Query)])}};
false -> false ->
{keep_state, St} {keep_state, St}
end. end.
@ -320,6 +372,7 @@ flush(#{acc := []} = St) ->
flush( flush(
#{ #{
id := Id, id := Id,
index := Index,
acc := Batch0, acc := Batch0,
batch_size := Size, batch_size := Size,
queue := Q0 queue := Q0
@ -329,12 +382,12 @@ flush(
QueryOpts = #{ QueryOpts = #{
inflight_name => maps:get(name, St) inflight_name => maps:get(name, St)
}, },
emqx_resource_metrics:batching_change(Id, -length(Batch)), emqx_resource_metrics:batching_shift(Id, Index, -length(Batch)),
Result = call_query(configured, Id, Batch, QueryOpts), Result = call_query(configured, Id, Index, Batch, QueryOpts),
St1 = cancel_flush_timer(St#{acc_left := Size, acc := []}), St1 = cancel_flush_timer(St#{acc_left := Size, acc := []}),
case batch_reply_caller(Id, Result, Batch) of case batch_reply_caller(Id, Result, Batch) of
true -> true ->
Q1 = maybe_append_queue(Id, Q0, [?Q_ITEM(Query) || Query <- Batch]), Q1 = maybe_append_queue(Id, Index, Q0, [?Q_ITEM(Query) || Query <- Batch]),
{next_state, blocked, St1#{queue := Q1}}; {next_state, blocked, St1#{queue := Q1}};
false -> false ->
{keep_state, St1} {keep_state, St1}
@ -412,7 +465,7 @@ handle_query_result(Id, Result, HasSent, BlockWorker) ->
inc_sent_success(Id, HasSent), inc_sent_success(Id, HasSent),
BlockWorker. BlockWorker.
call_query(QM0, Id, Query, QueryOpts) -> call_query(QM0, Id, Index, Query, QueryOpts) ->
?tp(call_query_enter, #{id => Id, query => Query}), ?tp(call_query_enter, #{id => Id, query => Query}),
case emqx_resource_manager:ets_lookup(Id) of case emqx_resource_manager:ets_lookup(Id) of
{ok, _Group, #{mod := Mod, state := ResSt, status := connected} = Data} -> {ok, _Group, #{mod := Mod, state := ResSt, status := connected} = Data} ->
@ -423,7 +476,7 @@ call_query(QM0, Id, Query, QueryOpts) ->
end, end,
CM = maps:get(callback_mode, Data), CM = maps:get(callback_mode, Data),
emqx_resource_metrics:matched_inc(Id), emqx_resource_metrics:matched_inc(Id),
apply_query_fun(call_mode(QM, CM), Mod, Id, Query, ResSt, QueryOpts); apply_query_fun(call_mode(QM, CM), Mod, Id, Index, Query, ResSt, QueryOpts);
{ok, _Group, #{status := stopped}} -> {ok, _Group, #{status := stopped}} ->
emqx_resource_metrics:matched_inc(Id), emqx_resource_metrics:matched_inc(Id),
?RESOURCE_ERROR(stopped, "resource stopped or disabled"); ?RESOURCE_ERROR(stopped, "resource stopped or disabled");
@ -452,10 +505,10 @@ call_query(QM0, Id, Query, QueryOpts) ->
end end
). ).
apply_query_fun(sync, Mod, Id, ?QUERY(_, Request, _) = _Query, ResSt, _QueryOpts) -> apply_query_fun(sync, Mod, Id, _Index, ?QUERY(_, Request, _) = _Query, ResSt, _QueryOpts) ->
?tp(call_query, #{id => Id, mod => Mod, query => _Query, res_st => ResSt}), ?tp(call_query, #{id => Id, mod => Mod, query => _Query, res_st => ResSt}),
?APPLY_RESOURCE(call_query, Mod:on_query(Id, Request, ResSt), Request); ?APPLY_RESOURCE(call_query, Mod:on_query(Id, Request, ResSt), Request);
apply_query_fun(async, Mod, Id, ?QUERY(_, Request, _) = Query, ResSt, QueryOpts) -> apply_query_fun(async, Mod, Id, Index, ?QUERY(_, Request, _) = Query, ResSt, QueryOpts) ->
?tp(call_query_async, #{id => Id, mod => Mod, query => Query, res_st => ResSt}), ?tp(call_query_async, #{id => Id, mod => Mod, query => Query, res_st => ResSt}),
Name = maps:get(inflight_name, QueryOpts, undefined), Name = maps:get(inflight_name, QueryOpts, undefined),
?APPLY_RESOURCE( ?APPLY_RESOURCE(
@ -464,21 +517,20 @@ apply_query_fun(async, Mod, Id, ?QUERY(_, Request, _) = Query, ResSt, QueryOpts)
true -> true ->
{async_return, inflight_full}; {async_return, inflight_full};
false -> false ->
ok = emqx_resource_metrics:inflight_change(Id, 1), ReplyFun = fun ?MODULE:reply_after_query/7,
ReplyFun = fun ?MODULE:reply_after_query/6,
Ref = make_message_ref(), Ref = make_message_ref(),
Args = [self(), Id, Name, Ref, Query], Args = [self(), Id, Index, Name, Ref, Query],
ok = inflight_append(Name, Ref, Query), ok = inflight_append(Name, Ref, Query, Id, Index),
Result = Mod:on_query_async(Id, Request, {ReplyFun, Args}, ResSt), Result = Mod:on_query_async(Id, Request, {ReplyFun, Args}, ResSt),
{async_return, Result} {async_return, Result}
end, end,
Request Request
); );
apply_query_fun(sync, Mod, Id, [?QUERY(_, _, _) | _] = Batch, ResSt, _QueryOpts) -> apply_query_fun(sync, Mod, Id, _Index, [?QUERY(_, _, _) | _] = Batch, ResSt, _QueryOpts) ->
?tp(call_batch_query, #{id => Id, mod => Mod, batch => Batch, res_st => ResSt}), ?tp(call_batch_query, #{id => Id, mod => Mod, batch => Batch, res_st => ResSt}),
Requests = [Request || ?QUERY(_From, Request, _) <- Batch], Requests = [Request || ?QUERY(_From, Request, _) <- Batch],
?APPLY_RESOURCE(call_batch_query, Mod:on_batch_query(Id, Requests, ResSt), Batch); ?APPLY_RESOURCE(call_batch_query, Mod:on_batch_query(Id, Requests, ResSt), Batch);
apply_query_fun(async, Mod, Id, [?QUERY(_, _, _) | _] = Batch, ResSt, QueryOpts) -> apply_query_fun(async, Mod, Id, Index, [?QUERY(_, _, _) | _] = Batch, ResSt, QueryOpts) ->
?tp(call_batch_query_async, #{id => Id, mod => Mod, batch => Batch, res_st => ResSt}), ?tp(call_batch_query_async, #{id => Id, mod => Mod, batch => Batch, res_st => ResSt}),
Name = maps:get(inflight_name, QueryOpts, undefined), Name = maps:get(inflight_name, QueryOpts, undefined),
?APPLY_RESOURCE( ?APPLY_RESOURCE(
@ -487,55 +539,46 @@ apply_query_fun(async, Mod, Id, [?QUERY(_, _, _) | _] = Batch, ResSt, QueryOpts)
true -> true ->
{async_return, inflight_full}; {async_return, inflight_full};
false -> false ->
BatchLen = length(Batch), ReplyFun = fun ?MODULE:batch_reply_after_query/7,
ok = emqx_resource_metrics:inflight_change(Id, BatchLen),
ReplyFun = fun ?MODULE:batch_reply_after_query/6,
Ref = make_message_ref(), Ref = make_message_ref(),
Args = {ReplyFun, [self(), Id, Name, Ref, Batch]}, Args = {ReplyFun, [self(), Id, Index, Name, Ref, Batch]},
Requests = [Request || ?QUERY(_From, Request, _) <- Batch], Requests = [Request || ?QUERY(_From, Request, _) <- Batch],
ok = inflight_append(Name, Ref, Batch), ok = inflight_append(Name, Ref, Batch, Id, Index),
Result = Mod:on_batch_query_async(Id, Requests, Args, ResSt), Result = Mod:on_batch_query_async(Id, Requests, Args, ResSt),
{async_return, Result} {async_return, Result}
end, end,
Batch Batch
). ).
reply_after_query(Pid, Id, Name, Ref, ?QUERY(From, Request, HasSent), Result) -> reply_after_query(Pid, Id, Index, Name, Ref, ?QUERY(From, Request, HasSent), Result) ->
%% NOTE: 'inflight' is message count that sent async but no ACK received, %% NOTE: 'inflight' is the count of messages that were sent async
%% NOT the message number ququed in the inflight window. %% but received no ACK, NOT the number of messages queued in the
emqx_resource_metrics:inflight_change(Id, -1), %% inflight window.
case reply_caller(Id, ?REPLY(From, Request, HasSent, Result)) of case reply_caller(Id, ?REPLY(From, Request, HasSent, Result)) of
true -> true ->
%% we marked these messages are 'queuing' although they are actually
%% keeped in inflight window, not replayq
emqx_resource_metrics:queuing_change(Id, 1),
?MODULE:block(Pid); ?MODULE:block(Pid);
false -> false ->
drop_inflight_and_resume(Pid, Name, Ref) drop_inflight_and_resume(Pid, Name, Ref, Id, Index)
end. end.
batch_reply_after_query(Pid, Id, Name, Ref, Batch, Result) -> batch_reply_after_query(Pid, Id, Index, Name, Ref, Batch, Result) ->
%% NOTE: 'inflight' is message count that sent async but no ACK received, %% NOTE: 'inflight' is the count of messages that were sent async
%% NOT the message number ququed in the inflight window. %% but received no ACK, NOT the number of messages queued in the
BatchLen = length(Batch), %% inflight window.
emqx_resource_metrics:inflight_change(Id, -BatchLen),
case batch_reply_caller(Id, Result, Batch) of case batch_reply_caller(Id, Result, Batch) of
true -> true ->
%% we marked these messages are 'queuing' although they are actually
%% kept in inflight window, not replayq
emqx_resource_metrics:queuing_change(Id, BatchLen),
?MODULE:block(Pid); ?MODULE:block(Pid);
false -> false ->
drop_inflight_and_resume(Pid, Name, Ref) drop_inflight_and_resume(Pid, Name, Ref, Id, Index)
end. end.
drop_inflight_and_resume(Pid, Name, Ref) -> drop_inflight_and_resume(Pid, Name, Ref, Id, Index) ->
case inflight_is_full(Name) of case inflight_is_full(Name) of
true -> true ->
inflight_drop(Name, Ref), inflight_drop(Name, Ref, Id, Index),
?MODULE:resume(Pid); ?MODULE:resume(Pid);
false -> false ->
inflight_drop(Name, Ref) inflight_drop(Name, Ref, Id, Index)
end. end.
%%============================================================================== %%==============================================================================
@ -548,10 +591,10 @@ queue_item_marshaller(Bin) when is_binary(Bin) ->
estimate_size(QItem) -> estimate_size(QItem) ->
size(queue_item_marshaller(QItem)). size(queue_item_marshaller(QItem)).
maybe_append_queue(Id, undefined, _Items) -> maybe_append_queue(Id, _Index, undefined, _Items) ->
emqx_resource_metrics:dropped_queue_not_enabled_inc(Id), emqx_resource_metrics:dropped_queue_not_enabled_inc(Id),
undefined; undefined;
maybe_append_queue(Id, Q, Items) -> maybe_append_queue(Id, Index, Q, Items) ->
Q2 = Q2 =
case replayq:overflow(Q) of case replayq:overflow(Q) of
Overflow when Overflow =< 0 -> Overflow when Overflow =< 0 ->
@ -561,13 +604,13 @@ maybe_append_queue(Id, Q, Items) ->
{Q1, QAckRef, Items2} = replayq:pop(Q, PopOpts), {Q1, QAckRef, Items2} = replayq:pop(Q, PopOpts),
ok = replayq:ack(Q1, QAckRef), ok = replayq:ack(Q1, QAckRef),
Dropped = length(Items2), Dropped = length(Items2),
emqx_resource_metrics:queuing_change(Id, -Dropped),
emqx_resource_metrics:dropped_queue_full_inc(Id), emqx_resource_metrics:dropped_queue_full_inc(Id),
?SLOG(error, #{msg => drop_query, reason => queue_full, dropped => Dropped}), ?SLOG(error, #{msg => drop_query, reason => queue_full, dropped => Dropped}),
Q1 Q1
end, end,
emqx_resource_metrics:queuing_change(Id, 1), Q3 = replayq:append(Q2, Items),
replayq:append(Q2, Items). emqx_resource_metrics:queuing_set(Id, Index, replayq:count(Q3)),
Q3.
get_first_n_from_queue(Q, N) -> get_first_n_from_queue(Q, N) ->
get_first_n_from_queue(Q, N, []). get_first_n_from_queue(Q, N, []).
@ -580,23 +623,23 @@ get_first_n_from_queue(Q, N, Acc) when N > 0 ->
?Q_ITEM(Query) -> get_first_n_from_queue(Q, N - 1, [Query | Acc]) ?Q_ITEM(Query) -> get_first_n_from_queue(Q, N - 1, [Query | Acc])
end. end.
drop_first_n_from_queue(Q, 0, _Id) -> drop_first_n_from_queue(Q, 0, _Id, _Index) ->
Q; Q;
drop_first_n_from_queue(Q, N, Id) when N > 0 -> drop_first_n_from_queue(Q, N, Id, Index) when N > 0 ->
drop_first_n_from_queue(drop_head(Q, Id), N - 1, Id). drop_first_n_from_queue(drop_head(Q, Id, Index), N - 1, Id, Index).
drop_head(Q, Id) -> drop_head(Q, Id, Index) ->
{Q1, AckRef, _} = replayq:pop(Q, #{count_limit => 1}), {NewQ, AckRef, _} = replayq:pop(Q, #{count_limit => 1}),
ok = replayq:ack(Q1, AckRef), ok = replayq:ack(NewQ, AckRef),
emqx_resource_metrics:queuing_change(Id, -1), emqx_resource_metrics:queuing_set(Id, Index, replayq:count(NewQ)),
Q1. NewQ.
%%============================================================================== %%==============================================================================
%% the inflight queue for async query %% the inflight queue for async query
-define(SIZE_REF, -1). -define(SIZE_REF, -1).
inflight_new(Name, InfltWinSZ) -> inflight_new(Name, InfltWinSZ, Id, Index) ->
_ = ets:new(Name, [named_table, ordered_set, public, {write_concurrency, true}]), _ = ets:new(Name, [named_table, ordered_set, public, {write_concurrency, true}]),
inflight_append(Name, ?SIZE_REF, {max_size, InfltWinSZ}), inflight_append(Name, ?SIZE_REF, {max_size, InfltWinSZ}, Id, Index),
ok. ok.
inflight_get_first(Name) -> inflight_get_first(Name) ->
@ -617,27 +660,39 @@ inflight_is_full(undefined) ->
false; false;
inflight_is_full(Name) -> inflight_is_full(Name) ->
[{_, {max_size, MaxSize}}] = ets:lookup(Name, ?SIZE_REF), [{_, {max_size, MaxSize}}] = ets:lookup(Name, ?SIZE_REF),
Size = inflight_size(Name),
Size >= MaxSize.
inflight_size(Name) ->
%% Note: we subtract 1 because there's a metadata row that hold
%% the maximum size value.
MetadataRowCount = 1,
case ets:info(Name, size) of case ets:info(Name, size) of
Size when Size > MaxSize -> true; undefined -> 0;
_ -> false Size -> max(0, Size - MetadataRowCount)
end. end.
inflight_append(undefined, _Ref, _Query) -> inflight_append(undefined, _Ref, _Query, _Id, _Index) ->
ok; ok;
inflight_append(Name, Ref, [?QUERY(_, _, _) | _] = Batch) -> inflight_append(Name, Ref, [?QUERY(_, _, _) | _] = Batch, Id, Index) ->
ets:insert(Name, {Ref, [?QUERY(From, Req, true) || ?QUERY(From, Req, _) <- Batch]}), ets:insert(Name, {Ref, [?QUERY(From, Req, true) || ?QUERY(From, Req, _) <- Batch]}),
emqx_resource_metrics:inflight_set(Id, Index, inflight_size(Name)),
ok; ok;
inflight_append(Name, Ref, ?QUERY(From, Req, _)) -> inflight_append(Name, Ref, ?QUERY(From, Req, _), Id, Index) ->
ets:insert(Name, {Ref, ?QUERY(From, Req, true)}), ets:insert(Name, {Ref, ?QUERY(From, Req, true)}),
emqx_resource_metrics:inflight_set(Id, Index, inflight_size(Name)),
ok; ok;
inflight_append(Name, Ref, Data) -> inflight_append(Name, Ref, Data, _Id, _Index) ->
ets:insert(Name, {Ref, Data}), ets:insert(Name, {Ref, Data}),
%% this is a metadata row being inserted; therefore, we don't bump
%% the inflight metric.
ok. ok.
inflight_drop(undefined, _) -> inflight_drop(undefined, _, _Id, _Index) ->
ok; ok;
inflight_drop(Name, Ref) -> inflight_drop(Name, Ref, Id, Index) ->
ets:delete(Name, Ref), ets:delete(Name, Ref),
emqx_resource_metrics:inflight_set(Id, Index, inflight_size(Name)),
ok. ok.
%%============================================================================== %%==============================================================================

View File

@ -39,6 +39,7 @@ groups() ->
init_per_testcase(_, Config) -> init_per_testcase(_, Config) ->
emqx_connector_demo:set_callback_mode(always_sync), emqx_connector_demo:set_callback_mode(always_sync),
Config. Config.
end_per_testcase(_, _Config) -> end_per_testcase(_, _Config) ->
_ = emqx_resource:remove(?ID). _ = emqx_resource:remove(?ID).
@ -503,7 +504,10 @@ t_stop_start(_) ->
), ),
%% add some metrics to test their persistence %% add some metrics to test their persistence
emqx_resource_metrics:batching_change(?ID, 5), WorkerID0 = <<"worker:0">>,
WorkerID1 = <<"worker:1">>,
emqx_resource_metrics:batching_set(?ID, WorkerID0, 2),
emqx_resource_metrics:batching_set(?ID, WorkerID1, 3),
?assertEqual(5, emqx_resource_metrics:batching_get(?ID)), ?assertEqual(5, emqx_resource_metrics:batching_get(?ID)),
{ok, _} = emqx_resource:check_and_recreate( {ok, _} = emqx_resource:check_and_recreate(
@ -537,7 +541,8 @@ t_stop_start(_) ->
?assert(is_process_alive(Pid1)), ?assert(is_process_alive(Pid1)),
%% now stop while resetting the metrics %% now stop while resetting the metrics
emqx_resource_metrics:batching_change(?ID, 5), emqx_resource_metrics:batching_set(?ID, WorkerID0, 1),
emqx_resource_metrics:batching_set(?ID, WorkerID1, 4),
?assertEqual(5, emqx_resource_metrics:batching_get(?ID)), ?assertEqual(5, emqx_resource_metrics:batching_get(?ID)),
ok = emqx_resource:stop(?ID), ok = emqx_resource:stop(?ID),
?assertEqual(0, emqx_resource_metrics:batching_get(?ID)), ?assertEqual(0, emqx_resource_metrics:batching_get(?ID)),

View File

@ -1,9 +1,9 @@
{erl_opts, [debug_info]}. {erl_opts, [debug_info]}.
{deps, [ {hocon, {git, "https://github.com/emqx/hocon.git", {tag, "0.33.0"}}} {deps, [ {hocon, {git, "https://github.com/emqx/hocon.git", {tag, "0.33.0"}}}
, {wolff, {git, "https://github.com/kafka4beam/wolff.git", {tag, "1.7.0"}}} , {wolff, {git, "https://github.com/kafka4beam/wolff.git", {tag, "1.7.4"}}}
, {kafka_protocol, {git, "https://github.com/kafka4beam/kafka_protocol.git", {tag, "4.1.0"}}} , {kafka_protocol, {git, "https://github.com/kafka4beam/kafka_protocol.git", {tag, "4.1.2"}}}
, {brod_gssapi, {git, "https://github.com/kafka4beam/brod_gssapi.git", {tag, "v0.1.0-rc1"}}} , {brod_gssapi, {git, "https://github.com/kafka4beam/brod_gssapi.git", {tag, "v0.1.0-rc1"}}}
, {brod, {git, "https://github.com/kafka4beam/brod.git", {tag, "3.16.4"}}} , {brod, {git, "https://github.com/kafka4beam/brod.git", {tag, "3.16.7"}}}
, {emqx_connector, {path, "../../apps/emqx_connector"}} , {emqx_connector, {path, "../../apps/emqx_connector"}}
, {emqx_resource, {path, "../../apps/emqx_resource"}} , {emqx_resource, {path, "../../apps/emqx_resource"}}
, {emqx_bridge, {path, "../../apps/emqx_bridge"}} , {emqx_bridge, {path, "../../apps/emqx_bridge"}}

View File

@ -33,7 +33,10 @@ on_start(InstId, Config) ->
authentication := Auth, authentication := Auth,
ssl := SSL ssl := SSL
} = Config, } = Config,
_ = maybe_install_wolff_telemetry_handlers(InstId), %% TODO: change this to `kafka_producer` after refactoring for kafka_consumer
BridgeType = kafka,
ResourceID = emqx_bridge_resource:resource_id(BridgeType, BridgeName),
_ = maybe_install_wolff_telemetry_handlers(InstId, ResourceID),
%% it's a bug if producer config is not found %% it's a bug if producer config is not found
%% the caller should not try to start a producer if %% the caller should not try to start a producer if
%% there is no producer config %% there is no producer config
@ -137,7 +140,7 @@ on_query(_InstId, {send_message, Message}, #{message_template := Template, produ
%% If the producer process is down when sending, this function would %% If the producer process is down when sending, this function would
%% raise an error exception which is to be caught by the caller of this callback %% raise an error exception which is to be caught by the caller of this callback
{_Partition, _Pid} = wolff:send(Producers, [KafkaMessage], {fun ?MODULE:on_kafka_ack/3, [#{}]}), {_Partition, _Pid} = wolff:send(Producers, [KafkaMessage], {fun ?MODULE:on_kafka_ack/3, [#{}]}),
ok. {async_return, ok}.
compile_message_template(#{ compile_message_template(#{
key := KeyTemplate, value := ValueTemplate, timestamp := TimestampTemplate key := KeyTemplate, value := ValueTemplate, timestamp := TimestampTemplate
@ -304,62 +307,79 @@ get_required(Field, Config, Throw) ->
Value =:= none andalso throw(Throw), Value =:= none andalso throw(Throw),
Value. Value.
%% we *must* match the bridge id in the event metadata with that in
%% the handler config; otherwise, multiple kafka producer bridges will
%% install multiple handlers to the same wolff events, multiplying the
handle_telemetry_event( handle_telemetry_event(
[wolff, dropped], [wolff, dropped],
#{counter_inc := Val}, #{counter_inc := Val},
#{bridge_id := ID}, #{bridge_id := ID},
_HandlerConfig #{bridge_id := ID}
) when is_integer(Val) -> ) when is_integer(Val) ->
emqx_resource_metrics:dropped_inc(ID, Val); emqx_resource_metrics:dropped_inc(ID, Val);
handle_telemetry_event( handle_telemetry_event(
[wolff, dropped_queue_full], [wolff, dropped_queue_full],
#{counter_inc := Val}, #{counter_inc := Val},
#{bridge_id := ID}, #{bridge_id := ID},
_HandlerConfig #{bridge_id := ID}
) when is_integer(Val) -> ) when is_integer(Val) ->
emqx_resource_metrics:dropped_queue_full_inc(ID, Val); %% When wolff emits a `dropped_queue_full' event due to replayq
%% overflow, it also emits a `dropped' event (at the time of
%% writing, wolff is 1.7.4). Since we already bump `dropped' when
%% `dropped.queue_full' occurs, we have to correct it here. This
%% correction will have to be dropped if wolff stops also emitting
%% `dropped'.
emqx_resource_metrics:dropped_queue_full_inc(ID, Val),
emqx_resource_metrics:dropped_inc(ID, -Val);
handle_telemetry_event( handle_telemetry_event(
[wolff, queuing], [wolff, queuing],
#{counter_inc := Val}, #{gauge_set := Val},
#{bridge_id := ID}, #{bridge_id := ID, partition_id := PartitionID},
_HandlerConfig #{bridge_id := ID}
) when is_integer(Val) -> ) when is_integer(Val) ->
emqx_resource_metrics:queuing_change(ID, Val); emqx_resource_metrics:queuing_set(ID, PartitionID, Val);
handle_telemetry_event( handle_telemetry_event(
[wolff, retried], [wolff, retried],
#{counter_inc := Val}, #{counter_inc := Val},
#{bridge_id := ID}, #{bridge_id := ID},
_HandlerConfig #{bridge_id := ID}
) when is_integer(Val) -> ) when is_integer(Val) ->
emqx_resource_metrics:retried_inc(ID, Val); emqx_resource_metrics:retried_inc(ID, Val);
handle_telemetry_event( handle_telemetry_event(
[wolff, failed], [wolff, failed],
#{counter_inc := Val}, #{counter_inc := Val},
#{bridge_id := ID}, #{bridge_id := ID},
_HandlerConfig #{bridge_id := ID}
) when is_integer(Val) -> ) when is_integer(Val) ->
emqx_resource_metrics:failed_inc(ID, Val); emqx_resource_metrics:failed_inc(ID, Val);
handle_telemetry_event( handle_telemetry_event(
[wolff, inflight], [wolff, inflight],
#{counter_inc := Val}, #{gauge_set := Val},
#{bridge_id := ID}, #{bridge_id := ID, partition_id := PartitionID},
_HandlerConfig #{bridge_id := ID}
) when is_integer(Val) -> ) when is_integer(Val) ->
emqx_resource_metrics:inflight_change(ID, Val); emqx_resource_metrics:inflight_set(ID, PartitionID, Val);
handle_telemetry_event( handle_telemetry_event(
[wolff, retried_failed], [wolff, retried_failed],
#{counter_inc := Val}, #{counter_inc := Val},
#{bridge_id := ID}, #{bridge_id := ID},
_HandlerConfig #{bridge_id := ID}
) when is_integer(Val) -> ) when is_integer(Val) ->
emqx_resource_metrics:retried_failed_inc(ID, Val); emqx_resource_metrics:retried_failed_inc(ID, Val);
handle_telemetry_event( handle_telemetry_event(
[wolff, retried_success], [wolff, retried_success],
#{counter_inc := Val}, #{counter_inc := Val},
#{bridge_id := ID}, #{bridge_id := ID},
_HandlerConfig #{bridge_id := ID}
) when is_integer(Val) -> ) when is_integer(Val) ->
emqx_resource_metrics:retried_success_inc(ID, Val); emqx_resource_metrics:retried_success_inc(ID, Val);
handle_telemetry_event(
[wolff, success],
#{counter_inc := Val},
#{bridge_id := ID},
#{bridge_id := ID}
) when is_integer(Val) ->
emqx_resource_metrics:success_inc(ID, Val);
handle_telemetry_event(_EventId, _Metrics, _MetaData, _HandlerConfig) -> handle_telemetry_event(_EventId, _Metrics, _MetaData, _HandlerConfig) ->
%% Event that we do not handle %% Event that we do not handle
ok. ok.
@ -372,17 +392,12 @@ uninstall_telemetry_handlers(InstanceID) ->
HandlerID = telemetry_handler_id(InstanceID), HandlerID = telemetry_handler_id(InstanceID),
telemetry:detach(HandlerID). telemetry:detach(HandlerID).
maybe_install_wolff_telemetry_handlers(InstanceID) -> maybe_install_wolff_telemetry_handlers(InstanceID, ResourceID) ->
%% Attach event handlers for Kafka telemetry events. If a handler with the %% Attach event handlers for Kafka telemetry events. If a handler with the
%% handler id already exists, the attach_many function does nothing %% handler id already exists, the attach_many function does nothing
telemetry:attach_many( telemetry:attach_many(
%% unique handler id %% unique handler id
telemetry_handler_id(InstanceID), telemetry_handler_id(InstanceID),
%% Note: we don't handle `[wolff, success]' because,
%% currently, we already increment the success counter for
%% this resource at `emqx_rule_runtime:handle_action' when
%% the response is `ok' and we would double increment it
%% here.
[ [
[wolff, dropped], [wolff, dropped],
[wolff, dropped_queue_full], [wolff, dropped_queue_full],
@ -391,8 +406,13 @@ maybe_install_wolff_telemetry_handlers(InstanceID) ->
[wolff, failed], [wolff, failed],
[wolff, inflight], [wolff, inflight],
[wolff, retried_failed], [wolff, retried_failed],
[wolff, retried_success] [wolff, retried_success],
[wolff, success]
], ],
fun ?MODULE:handle_telemetry_event/4, fun ?MODULE:handle_telemetry_event/4,
[] %% we *must* keep track of the same id that is handed down to
%% wolff producers; otherwise, multiple kafka producer bridges
%% will install multiple handlers to the same wolff events,
%% multiplying the metric counts...
#{bridge_id => ResourceID}
). ).

View File

@ -390,7 +390,7 @@ t_failed_creation_then_fix(_Config) ->
}, },
{ok, Offset} = resolve_kafka_offset(kafka_hosts(), KafkaTopic, 0), {ok, Offset} = resolve_kafka_offset(kafka_hosts(), KafkaTopic, 0),
ct:pal("base offset before testing ~p", [Offset]), ct:pal("base offset before testing ~p", [Offset]),
?assertEqual(ok, ?PRODUCER:on_query(ResourceId, {send_message, Msg}, State)), ?assertEqual({async_return, ok}, ?PRODUCER:on_query(ResourceId, {send_message, Msg}, State)),
{ok, {_, [KafkaMsg]}} = brod:fetch(kafka_hosts(), KafkaTopic, 0, Offset), {ok, {_, [KafkaMsg]}} = brod:fetch(kafka_hosts(), KafkaTopic, 0, Offset),
?assertMatch(#kafka_message{key = BinTime}, KafkaMsg), ?assertMatch(#kafka_message{key = BinTime}, KafkaMsg),
%% TODO: refactor those into init/end per testcase %% TODO: refactor those into init/end per testcase
@ -455,7 +455,7 @@ publish_helper(#{
StartRes = ?PRODUCER:on_start(InstId, Conf), StartRes = ?PRODUCER:on_start(InstId, Conf),
{ok, State} = StartRes, {ok, State} = StartRes,
OnQueryRes = ?PRODUCER:on_query(InstId, {send_message, Msg}, State), OnQueryRes = ?PRODUCER:on_query(InstId, {send_message, Msg}, State),
ok = OnQueryRes, {async_return, ok} = OnQueryRes,
{ok, {_, [KafkaMsg]}} = brod:fetch(kafka_hosts(), KafkaTopic, 0, Offset), {ok, {_, [KafkaMsg]}} = brod:fetch(kafka_hosts(), KafkaTopic, 0, Offset),
?assertMatch(#kafka_message{key = BinTime}, KafkaMsg), ?assertMatch(#kafka_message{key = BinTime}, KafkaMsg),
ok = ?PRODUCER:on_stop(InstId, State), ok = ?PRODUCER:on_stop(InstId, State),

View File

@ -124,6 +124,7 @@ init_per_testcase(TestCase, Config0) when
delete_all_bridges(), delete_all_bridges(),
Tid = install_telemetry_handler(TestCase), Tid = install_telemetry_handler(TestCase),
Config = generate_config(Config0), Config = generate_config(Config0),
put(telemetry_table, Tid),
[{telemetry_table, Tid} | Config]; [{telemetry_table, Tid} | Config];
false -> false ->
{skip, no_batching} {skip, no_batching}
@ -133,12 +134,14 @@ init_per_testcase(TestCase, Config0) ->
delete_all_bridges(), delete_all_bridges(),
Tid = install_telemetry_handler(TestCase), Tid = install_telemetry_handler(TestCase),
Config = generate_config(Config0), Config = generate_config(Config0),
put(telemetry_table, Tid),
[{telemetry_table, Tid} | Config]. [{telemetry_table, Tid} | Config].
end_per_testcase(_TestCase, _Config) -> end_per_testcase(_TestCase, _Config) ->
ok = snabbkaffe:stop(), ok = snabbkaffe:stop(),
delete_all_bridges(), delete_all_bridges(),
ok = emqx_connector_web_hook_server:stop(), ok = emqx_connector_web_hook_server:stop(),
emqx_common_test_helpers:call_janitor(),
ok. ok.
%%------------------------------------------------------------------------------ %%------------------------------------------------------------------------------
@ -392,7 +395,11 @@ assert_metrics(ExpectedMetrics, ResourceId) ->
maps:keys(ExpectedMetrics) maps:keys(ExpectedMetrics)
), ),
CurrentMetrics = current_metrics(ResourceId), CurrentMetrics = current_metrics(ResourceId),
?assertEqual(ExpectedMetrics, Metrics, #{current_metrics => CurrentMetrics}), TelemetryTable = get(telemetry_table),
RecordedEvents = ets:tab2list(TelemetryTable),
?assertEqual(ExpectedMetrics, Metrics, #{
current_metrics => CurrentMetrics, recorded_events => RecordedEvents
}),
ok. ok.
assert_empty_metrics(ResourceId) -> assert_empty_metrics(ResourceId) ->
@ -553,6 +560,7 @@ t_publish_success(Config) ->
ResourceId = ?config(resource_id, Config), ResourceId = ?config(resource_id, Config),
ServiceAccountJSON = ?config(service_account_json, Config), ServiceAccountJSON = ?config(service_account_json, Config),
TelemetryTable = ?config(telemetry_table, Config), TelemetryTable = ?config(telemetry_table, Config),
QueryMode = ?config(query_mode, Config),
Topic = <<"t/topic">>, Topic = <<"t/topic">>,
?check_trace( ?check_trace(
create_bridge(Config), create_bridge(Config),
@ -581,6 +589,17 @@ t_publish_success(Config) ->
), ),
%% to avoid test flakiness %% to avoid test flakiness
wait_telemetry_event(TelemetryTable, success, ResourceId), wait_telemetry_event(TelemetryTable, success, ResourceId),
ExpectedInflightEvents =
case QueryMode of
sync -> 1;
async -> 3
end,
wait_telemetry_event(
TelemetryTable,
inflight,
ResourceId,
#{n_events => ExpectedInflightEvents, timeout => 5_000}
),
assert_metrics( assert_metrics(
#{ #{
batching => 0, batching => 0,
@ -600,6 +619,7 @@ t_publish_success_local_topic(Config) ->
ResourceId = ?config(resource_id, Config), ResourceId = ?config(resource_id, Config),
ServiceAccountJSON = ?config(service_account_json, Config), ServiceAccountJSON = ?config(service_account_json, Config),
TelemetryTable = ?config(telemetry_table, Config), TelemetryTable = ?config(telemetry_table, Config),
QueryMode = ?config(query_mode, Config),
LocalTopic = <<"local/topic">>, LocalTopic = <<"local/topic">>,
{ok, _} = create_bridge(Config, #{<<"local_topic">> => LocalTopic}), {ok, _} = create_bridge(Config, #{<<"local_topic">> => LocalTopic}),
assert_empty_metrics(ResourceId), assert_empty_metrics(ResourceId),
@ -618,6 +638,17 @@ t_publish_success_local_topic(Config) ->
), ),
%% to avoid test flakiness %% to avoid test flakiness
wait_telemetry_event(TelemetryTable, success, ResourceId), wait_telemetry_event(TelemetryTable, success, ResourceId),
ExpectedInflightEvents =
case QueryMode of
sync -> 1;
async -> 3
end,
wait_telemetry_event(
TelemetryTable,
inflight,
ResourceId,
#{n_events => ExpectedInflightEvents, timeout => 5_000}
),
assert_metrics( assert_metrics(
#{ #{
batching => 0, batching => 0,
@ -648,6 +679,7 @@ t_publish_templated(Config) ->
ResourceId = ?config(resource_id, Config), ResourceId = ?config(resource_id, Config),
ServiceAccountJSON = ?config(service_account_json, Config), ServiceAccountJSON = ?config(service_account_json, Config),
TelemetryTable = ?config(telemetry_table, Config), TelemetryTable = ?config(telemetry_table, Config),
QueryMode = ?config(query_mode, Config),
Topic = <<"t/topic">>, Topic = <<"t/topic">>,
PayloadTemplate = << PayloadTemplate = <<
"{\"payload\": \"${payload}\"," "{\"payload\": \"${payload}\","
@ -693,6 +725,17 @@ t_publish_templated(Config) ->
), ),
%% to avoid test flakiness %% to avoid test flakiness
wait_telemetry_event(TelemetryTable, success, ResourceId), wait_telemetry_event(TelemetryTable, success, ResourceId),
ExpectedInflightEvents =
case QueryMode of
sync -> 1;
async -> 3
end,
wait_telemetry_event(
TelemetryTable,
inflight,
ResourceId,
#{n_events => ExpectedInflightEvents, timeout => 5_000}
),
assert_metrics( assert_metrics(
#{ #{
batching => 0, batching => 0,
@ -1046,7 +1089,7 @@ do_econnrefused_or_timeout_test(Config, Error) ->
%% response expired, this succeeds. %% response expired, this succeeds.
{econnrefused, async, _} -> {econnrefused, async, _} ->
wait_telemetry_event(TelemetryTable, queuing, ResourceId, #{ wait_telemetry_event(TelemetryTable, queuing, ResourceId, #{
timeout => 10_000, n_events => 2 timeout => 10_000, n_events => 1
}), }),
CurrentMetrics = current_metrics(ResourceId), CurrentMetrics = current_metrics(ResourceId),
RecordedEvents = ets:tab2list(TelemetryTable), RecordedEvents = ets:tab2list(TelemetryTable),
@ -1283,6 +1326,17 @@ t_unrecoverable_error(Config) ->
end end
), ),
wait_telemetry_event(TelemetryTable, failed, ResourceId), wait_telemetry_event(TelemetryTable, failed, ResourceId),
ExpectedInflightEvents =
case QueryMode of
sync -> 1;
async -> 3
end,
wait_telemetry_event(
TelemetryTable,
inflight,
ResourceId,
#{n_events => ExpectedInflightEvents, timeout => 5_000}
),
assert_metrics( assert_metrics(
#{ #{
batching => 0, batching => 0,

View File

@ -132,10 +132,10 @@ defmodule EMQXUmbrella.MixProject do
[ [
{:hstreamdb_erl, github: "hstreamdb/hstreamdb_erl", tag: "0.2.5"}, {:hstreamdb_erl, github: "hstreamdb/hstreamdb_erl", tag: "0.2.5"},
{:influxdb, github: "emqx/influxdb-client-erl", tag: "1.1.4", override: true}, {:influxdb, github: "emqx/influxdb-client-erl", tag: "1.1.4", override: true},
{:wolff, github: "kafka4beam/wolff", tag: "1.7.0"}, {:wolff, github: "kafka4beam/wolff", tag: "1.7.4"},
{:kafka_protocol, github: "kafka4beam/kafka_protocol", tag: "4.1.0", override: true}, {:kafka_protocol, github: "kafka4beam/kafka_protocol", tag: "4.1.2", override: true},
{:brod_gssapi, github: "kafka4beam/brod_gssapi", tag: "v0.1.0-rc1"}, {:brod_gssapi, github: "kafka4beam/brod_gssapi", tag: "v0.1.0-rc1"},
{:brod, github: "kafka4beam/brod", tag: "3.16.4"}, {:brod, github: "kafka4beam/brod", tag: "3.16.7"},
{:snappyer, "1.2.8", override: true}, {:snappyer, "1.2.8", override: true},
{:supervisor3, "1.1.11", override: true} {:supervisor3, "1.1.11", override: true}
] ]

View File

@ -10,8 +10,9 @@ main(_) ->
Conf = [merge(Conf0, Cfgs1), Conf = [merge(Conf0, Cfgs1),
io_lib:nl() io_lib:nl()
], ],
ok = filelib:ensure_dir("apps/emqx_dashboard/priv/i18n.conf"), OutputFile = "apps/emqx_dashboard/priv/i18n.conf",
ok = file:write_file("apps/emqx_dashboard/priv/i18n.conf", Conf). ok = filelib:ensure_dir(OutputFile),
ok = file:write_file(OutputFile, Conf).
merge(BaseConf, Cfgs) -> merge(BaseConf, Cfgs) ->
lists:foldl( lists:foldl(