feat: enable log throttling for potentially flooding log events

This commit is contained in:
Serge Tupchii 2024-02-19 21:30:25 +02:00
parent 98ba300f7c
commit 9bd0d1ba1b
9 changed files with 80 additions and 39 deletions

View File

@ -40,8 +40,8 @@
end end
). ).
%% NOTE: do not forget to add every used msg to the default value of %% NOTE: do not forget to use atom for msg and add every used msg to
%% `log.thorttling.msgs` list. %% the default value of `log.thorttling.msgs` list.
-define(SLOG_THROTTLE(Level, Data), -define(SLOG_THROTTLE(Level, Data),
?SLOG_THROTTLE(Level, Data, #{}) ?SLOG_THROTTLE(Level, Data, #{})
). ).

View File

@ -183,8 +183,13 @@ log_result(#{username := Username}, Topic, Action, From, Result) ->
} }
end, end,
case Result of case Result of
allow -> ?SLOG(info, (LogMeta())#{msg => "authorization_permission_allowed"}); allow ->
deny -> ?SLOG(info, (LogMeta())#{msg => "authorization_permission_denied"}) ?SLOG(info, (LogMeta())#{msg => "authorization_permission_allowed"});
deny ->
?SLOG_THROTTLE(
warning,
(LogMeta())#{msg => authorization_permission_denied}
)
end. end.
%% @private Format authorization rules source. %% @private Format authorization rules source.

View File

@ -616,10 +616,10 @@ process_publish(Packet = ?PUBLISH_PACKET(QoS, Topic, PacketId), Channel) ->
Msg = packet_to_message(NPacket, NChannel), Msg = packet_to_message(NPacket, NChannel),
do_publish(PacketId, Msg, NChannel); do_publish(PacketId, Msg, NChannel);
{error, Rc = ?RC_NOT_AUTHORIZED, NChannel} -> {error, Rc = ?RC_NOT_AUTHORIZED, NChannel} ->
?SLOG( ?SLOG_THROTTLE(
info, warning,
#{ #{
msg => "cannot_publish_to_topic", msg => cannot_publish_to_topic_due_to_not_authorized,
reason => emqx_reason_codes:name(Rc) reason => emqx_reason_codes:name(Rc)
}, },
#{topic => Topic} #{topic => Topic}
@ -635,10 +635,10 @@ process_publish(Packet = ?PUBLISH_PACKET(QoS, Topic, PacketId), Channel) ->
handle_out(disconnect, Rc, NChannel) handle_out(disconnect, Rc, NChannel)
end; end;
{error, Rc = ?RC_QUOTA_EXCEEDED, NChannel} -> {error, Rc = ?RC_QUOTA_EXCEEDED, NChannel} ->
?SLOG( ?SLOG_THROTTLE(
info, warning,
#{ #{
msg => "cannot_publish_to_topic", msg => cannot_publish_to_topic_due_to_quota_exceeded,
reason => emqx_reason_codes:name(Rc) reason => emqx_reason_codes:name(Rc)
}, },
#{topic => Topic} #{topic => Topic}

View File

@ -50,10 +50,10 @@
-define(MSGS_LIST, emqx:get_config([log, throttling, msgs], [])). -define(MSGS_LIST, emqx:get_config([log, throttling, msgs], [])).
-define(TIME_WINDOW_MS, timer:seconds(emqx:get_config([log, throttling, time_window], 60))). -define(TIME_WINDOW_MS, timer:seconds(emqx:get_config([log, throttling, time_window], 60))).
-spec allow(logger:level(), string()) -> boolean(). -spec allow(logger:level(), atom()) -> boolean().
allow(debug, _Msg) -> allow(debug, _Msg) ->
true; true;
allow(_Level, Msg) -> allow(_Level, Msg) when is_atom(Msg) ->
Seq = persistent_term:get(?SEQ_ID(Msg), undefined), Seq = persistent_term:get(?SEQ_ID(Msg), undefined),
case Seq of case Seq of
undefined -> undefined ->
@ -79,8 +79,9 @@ start_link() ->
init([]) -> init([]) ->
ok = lists:foreach(fun(Msg) -> ?NEW_THROTTLE(Msg, ?NEW_SEQ) end, ?MSGS_LIST), ok = lists:foreach(fun(Msg) -> ?NEW_THROTTLE(Msg, ?NEW_SEQ) end, ?MSGS_LIST),
TimerRef = schedule_refresh(?TIME_WINDOW_MS), CurrentPeriodMs = ?TIME_WINDOW_MS,
{ok, #{timer_ref => TimerRef}}. TimerRef = schedule_refresh(CurrentPeriodMs),
{ok, #{timer_ref => TimerRef, current_period_ms => CurrentPeriodMs}}.
handle_call(Req, _From, State) -> handle_call(Req, _From, State) ->
?SLOG(error, #{msg => "unexpected_call", call => Req}), ?SLOG(error, #{msg => "unexpected_call", call => Req}),
@ -90,8 +91,7 @@ handle_cast(Msg, State) ->
?SLOG(error, #{msg => "unexpected_cast", cast => Msg}), ?SLOG(error, #{msg => "unexpected_cast", cast => Msg}),
{noreply, State}. {noreply, State}.
handle_info(refresh, State) -> handle_info(refresh, #{current_period_ms := PeriodMs} = State) ->
PeriodMs = ?TIME_WINDOW_MS,
Msgs = ?MSGS_LIST, Msgs = ?MSGS_LIST,
DroppedStats = lists:foldl( DroppedStats = lists:foldl(
fun(Msg, Acc) -> fun(Msg, Acc) ->
@ -112,7 +112,11 @@ handle_info(refresh, State) ->
Msgs Msgs
), ),
maybe_log_dropped(DroppedStats, PeriodMs), maybe_log_dropped(DroppedStats, PeriodMs),
State1 = State#{timer_ref => schedule_refresh(PeriodMs)}, NewPeriodMs = ?TIME_WINDOW_MS,
State1 = State#{
timer_ref => schedule_refresh(NewPeriodMs),
current_period_ms => NewPeriodMs
},
{noreply, State1}; {noreply, State1};
handle_info(Info, State) -> handle_info(Info, State) ->
?SLOG(error, #{msg => "unxpected_info", info => Info}), ?SLOG(error, #{msg => "unxpected_info", info => Info}),
@ -143,4 +147,5 @@ maybe_log_dropped(_DroppedStats, _PeriodMs) ->
ok. ok.
schedule_refresh(PeriodMs) -> schedule_refresh(PeriodMs) ->
?tp(log_throttler_sched_refresh, #{new_period_ms => PeriodMs}),
erlang:send_after(PeriodMs, ?MODULE, refresh). erlang:send_after(PeriodMs, ?MODULE, refresh).

View File

@ -62,10 +62,10 @@ handle_event(ClientInfo, {dropped, Msg, #{reason := queue_full, logctx := Ctx}})
ok = emqx_metrics:inc('delivery.dropped.queue_full'), ok = emqx_metrics:inc('delivery.dropped.queue_full'),
ok = inc_pd('send_msg.dropped', 1), ok = inc_pd('send_msg.dropped', 1),
ok = inc_pd('send_msg.dropped.queue_full', 1), ok = inc_pd('send_msg.dropped.queue_full', 1),
?SLOG( ?SLOG_THROTTLE(
info, warning,
Ctx#{ Ctx#{
msg => "dropped_msg_due_to_mqueue_is_full", msg => dropped_msg_due_to_mqueue_is_full,
payload => Msg#message.payload payload => Msg#message.payload
}, },
#{topic => Msg#message.topic} #{topic => Msg#message.topic}

View File

@ -23,8 +23,10 @@
-include_lib("common_test/include/ct.hrl"). -include_lib("common_test/include/ct.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl"). -include_lib("snabbkaffe/include/snabbkaffe.hrl").
-define(THROTTLE_MSG, "test_throttle_msg"). %% Have to use real msgs, as the schema is guarded by enum.
-define(THROTTLE_MSG1, "test_throttle_msg1"). -define(THROTTLE_MSG, authorization_permission_denied).
-define(THROTTLE_MSG1, cannot_publish_to_topic_due_to_not_authorized).
-define(TIME_WINDOW, <<"1s">>).
all() -> emqx_common_test_helpers:all(?MODULE). all() -> emqx_common_test_helpers:all(?MODULE).
@ -39,7 +41,7 @@ init_per_suite(Config) ->
#{ #{
log => #{ log => #{
throttling => #{ throttling => #{
time_window => <<"1s">>, msgs => [?THROTTLE_MSG] time_window => ?TIME_WINDOW, msgs => [?THROTTLE_MSG]
} }
} }
} }
@ -70,6 +72,10 @@ end_per_testcase(t_throttle_add_new_msg, _Config) ->
ok = snabbkaffe:stop(), ok = snabbkaffe:stop(),
{ok, _} = emqx_conf:update([log, throttling, msgs], [?THROTTLE_MSG], #{}), {ok, _} = emqx_conf:update([log, throttling, msgs], [?THROTTLE_MSG], #{}),
ok; ok;
end_per_testcase(t_update_time_window, _Config) ->
ok = snabbkaffe:stop(),
{ok, _} = emqx_conf:update([log, throttling, time_window], ?TIME_WINDOW, #{}),
ok;
end_per_testcase(_TC, _Config) -> end_per_testcase(_TC, _Config) ->
ok = snabbkaffe:stop(). ok = snabbkaffe:stop().
@ -87,7 +93,7 @@ t_throttle(_Config) ->
lists:seq(1, 100) lists:seq(1, 100)
), ),
{ok, _} = ?block_until( {ok, _} = ?block_until(
#{?snk_kind := log_throttler_dropped, throttled_msg := ?THROTTLE_MSG}, 3000 #{?snk_kind := log_throttler_dropped, throttled_msg := ?THROTTLE_MSG}, 5000
), ),
?assert(emqx_log_throttler:allow(warning, ?THROTTLE_MSG)), ?assert(emqx_log_throttler:allow(warning, ?THROTTLE_MSG)),
@ -110,7 +116,7 @@ t_throttle_add_new_msg(_Config) ->
?check_trace( ?check_trace(
begin begin
?block_until( ?block_until(
#{?snk_kind := log_throttler_new_msg, throttled_msg := ?THROTTLE_MSG1}, 3000 #{?snk_kind := log_throttler_new_msg, throttled_msg := ?THROTTLE_MSG1}, 5000
), ),
?assert(emqx_log_throttler:allow(warning, ?THROTTLE_MSG1)), ?assert(emqx_log_throttler:allow(warning, ?THROTTLE_MSG1)),
?assertNot(emqx_log_throttler:allow(warning, ?THROTTLE_MSG1)), ?assertNot(emqx_log_throttler:allow(warning, ?THROTTLE_MSG1)),
@ -128,11 +134,25 @@ t_throttle_add_new_msg(_Config) ->
t_throttle_no_msg(_Config) -> t_throttle_no_msg(_Config) ->
%% Must simply pass with no crashes %% Must simply pass with no crashes
?assert(emqx_log_throttler:allow(warning, "no_test_throttle_msg")), ?assert(emqx_log_throttler:allow(warning, no_test_throttle_msg)),
?assert(emqx_log_throttler:allow(warning, "no_test_throttle_msg")), ?assert(emqx_log_throttler:allow(warning, no_test_throttle_msg)),
timer:sleep(10), timer:sleep(10),
?assert(erlang:is_process_alive(erlang:whereis(emqx_log_throttler))). ?assert(erlang:is_process_alive(erlang:whereis(emqx_log_throttler))).
t_update_time_window(_Config) ->
?check_trace(
begin
?wait_async_action(
emqx_conf:update([log, throttling, time_window], <<"2s">>, #{}),
#{?snk_kind := log_throttler_sched_refresh, new_period_ms := 2000},
5000
),
timer:sleep(10),
?assert(erlang:is_process_alive(erlang:whereis(emqx_log_throttler)))
end,
[]
).
%%-------------------------------------------------------------------- %%--------------------------------------------------------------------
%% internal functions %% internal functions
%%-------------------------------------------------------------------- %%--------------------------------------------------------------------

View File

@ -75,6 +75,14 @@
%% 1 million default ports counter %% 1 million default ports counter
-define(DEFAULT_MAX_PORTS, 1024 * 1024). -define(DEFAULT_MAX_PORTS, 1024 * 1024).
-define(LOG_THROTTLING_MSGS, [
authorization_permission_denied,
cannot_publish_to_topic_due_to_not_authorized,
cannot_publish_to_topic_due_to_quota_exceeded,
connection_rejected_due_to_license_limit_reached,
dropped_msg_due_to_mqueue_is_full
]).
%% Callback to upgrade config after loaded from config file but before validation. %% Callback to upgrade config after loaded from config file but before validation.
upgrade_raw_conf(Raw0) -> upgrade_raw_conf(Raw0) ->
Raw1 = emqx_connector_schema:transform_bridges_v1_to_connectors_and_bridges_v2(Raw0), Raw1 = emqx_connector_schema:transform_bridges_v1_to_connectors_and_bridges_v2(Raw0),
@ -910,7 +918,7 @@ fields("log") ->
importance => ?IMPORTANCE_HIGH importance => ?IMPORTANCE_HIGH
} }
)}, )},
{"throttling", {throttling,
sc(?R_REF("log_throttling"), #{ sc(?R_REF("log_throttling"), #{
desc => ?DESC("log_throttling"), desc => ?DESC("log_throttling"),
importance => ?IMPORTANCE_MEDIUM importance => ?IMPORTANCE_MEDIUM
@ -1019,22 +1027,22 @@ fields("log_burst_limit") ->
]; ];
fields("log_throttling") -> fields("log_throttling") ->
[ [
{"window_time", {time_window,
sc( sc(
emqx_schema:duration_s(), emqx_schema:duration_s(),
#{ #{
default => <<"1m">>, default => <<"1m">>,
desc => ?DESC("log_throttling_window_time"), desc => ?DESC("log_throttling_time_window"),
importance => ?IMPORTANCE_MEDIUM importance => ?IMPORTANCE_MEDIUM
} }
)}, )},
%% A static list of event ids used in ?SLOG_THROTTLE/3,4 macro. %% A static list of msgs used in ?SLOG_THROTTLE/2,3 macro.
%% For internal (developer) use only. %% For internal (developer) use only.
{"event_ids", {msgs,
sc( sc(
hoconsc:array(atom()), hoconsc:array(hoconsc:enum(?LOG_THROTTLING_MSGS)),
#{ #{
default => [], default => ?LOG_THROTTLING_MSGS,
importance => ?IMPORTANCE_HIDDEN importance => ?IMPORTANCE_HIDDEN
} }
)} )}

View File

@ -85,7 +85,10 @@ check(_ConnInfo, AckProps) ->
{ok, #{max_connections := MaxClients}} -> {ok, #{max_connections := MaxClients}} ->
case check_max_clients_exceeded(MaxClients) of case check_max_clients_exceeded(MaxClients) of
true -> true ->
?SLOG(info, #{msg => "connection_rejected_due_to_license_limit_reached"}), ?SLOG_THROTTLE(
error,
#{msg => connection_rejected_due_to_license_limit_reached}
),
{stop, {error, ?RC_QUOTA_EXCEEDED}}; {stop, {error, ?RC_QUOTA_EXCEEDED}};
false -> false ->
{ok, AckProps} {ok, AckProps}

View File

@ -482,11 +482,11 @@ desc_log_throttling.desc:
"""Log throttling feature reduces the number of potentially flooding logged events by """Log throttling feature reduces the number of potentially flooding logged events by
dropping all but the first event within a configured time window.""" dropping all but the first event within a configured time window."""
log_throttling_window_time.desc: log_throttling_time_window.desc:
"""A time interval at which log throttling is applied. Defaults to 1 minute.""" """For throttled messages, only log 1 in each time window."""
log_throttling_window_time.label: log_throttling_time_window.label:
"""Log Throttling Window Time""" """Log Throttling Time Window"""
cluster_dns_record_type.desc: cluster_dns_record_type.desc:
"""DNS record type.""" """DNS record type."""