fix(emqx_alarm): add safe call API to activate/deactivate alarms and use it in resource_manager

Don't let 'emqx_resource_manager' crash because of emqx_alarm timeouts.

Fixes: EMQX-9529/#10357
This commit is contained in:
Serge Tupchii 2023-04-19 21:35:56 +03:00
parent b960d2ecb3
commit 423a30fbb3
3 changed files with 35 additions and 4 deletions

View File

@ -42,7 +42,9 @@
get_alarms/0, get_alarms/0,
get_alarms/1, get_alarms/1,
format/1, format/1,
format/2 format/2,
safe_activate/3,
safe_deactivate/1
]). ]).
%% gen_server callbacks %% gen_server callbacks
@ -122,6 +124,9 @@ activate(Name, Details) ->
activate(Name, Details, Message) -> activate(Name, Details, Message) ->
gen_server:call(?MODULE, {activate_alarm, Name, Details, Message}). gen_server:call(?MODULE, {activate_alarm, Name, Details, Message}).
%% Activate an alarm without letting a failed server call propagate.
%% Builds the same `activate_alarm' request that activate/3 sends, but
%% routes it through safe_call/1 and returns whatever safe_call/1 returns.
safe_activate(Name, Details, Message) ->
    Req = {activate_alarm, Name, Details, Message},
    safe_call(Req).
-spec ensure_deactivated(binary() | atom()) -> ok. -spec ensure_deactivated(binary() | atom()) -> ok.
ensure_deactivated(Name) -> ensure_deactivated(Name) ->
ensure_deactivated(Name, no_details). ensure_deactivated(Name, no_details).
@ -154,6 +159,9 @@ deactivate(Name, Details) ->
deactivate(Name, Details, Message) -> deactivate(Name, Details, Message) ->
gen_server:call(?MODULE, {deactivate_alarm, Name, Details, Message}). gen_server:call(?MODULE, {deactivate_alarm, Name, Details, Message}).
%% Deactivate the alarm `Name' without letting a failed server call propagate.
%% Sends the `deactivate_alarm' request with `no_details' and an empty
%% message through safe_call/1 and returns whatever safe_call/1 returns.
safe_deactivate(Name) ->
    Req = {deactivate_alarm, Name, no_details, <<"">>},
    safe_call(Req).
-spec delete_all_deactivated_alarms() -> ok. -spec delete_all_deactivated_alarms() -> ok.
delete_all_deactivated_alarms() -> delete_all_deactivated_alarms() ->
gen_server:call(?MODULE, delete_all_deactivated_alarms). gen_server:call(?MODULE, delete_all_deactivated_alarms).
@ -468,3 +476,19 @@ normalize_message(Name, <<"">>) ->
list_to_binary(io_lib:format("~p", [Name])); list_to_binary(io_lib:format("~p", [Name]));
normalize_message(_Name, Message) -> normalize_message(_Name, Message) ->
Message. Message.
%% Send `Req' to the alarm server with gen_server:call/2 and turn any
%% exception raised by the call into an `{error, _}' tuple.
%%
%% - A `{timeout, _}' reason is logged at warning level and reported as
%%   `{error, timeout}'.
%% - Any other exception is logged at error level together with its
%%   stacktrace and reported as `{error, Reason}'.
%%
%% On success the server's reply is returned unchanged.
safe_call(Req) ->
    try gen_server:call(?MODULE, Req) of
        Reply ->
            Reply
    catch
        _:{timeout, _} = Timeout ->
            ?SLOG(warning, #{msg => "emqx_alarm_safe_call_timeout", reason => Timeout}),
            {error, timeout};
        _:Other:Stacktrace ->
            ?SLOG(error, #{
                msg => "emqx_alarm_safe_call_exception",
                reason => Other,
                stacktrace => Stacktrace
            }),
            {error, Other}
    end.

View File

@ -375,7 +375,7 @@ handle_event(state_timeout, health_check, connecting, Data) ->
%% and successful health_checks %% and successful health_checks
handle_event(enter, _OldState, connected = State, Data) -> handle_event(enter, _OldState, connected = State, Data) ->
ok = log_state_consistency(State, Data), ok = log_state_consistency(State, Data),
_ = emqx_alarm:deactivate(Data#data.id), _ = emqx_alarm:safe_deactivate(Data#data.id),
?tp(resource_connected_enter, #{}), ?tp(resource_connected_enter, #{}),
{keep_state_and_data, health_check_actions(Data)}; {keep_state_and_data, health_check_actions(Data)};
handle_event(state_timeout, health_check, connected, Data) -> handle_event(state_timeout, health_check, connected, Data) ->
@ -618,7 +618,7 @@ maybe_alarm(_Status, ResId, Error, _PrevError) ->
{error, undefined} -> <<"Unknown reason">>; {error, undefined} -> <<"Unknown reason">>;
{error, Reason} -> emqx_utils:readable_error_msg(Reason) {error, Reason} -> emqx_utils:readable_error_msg(Reason)
end, end,
emqx_alarm:activate( emqx_alarm:safe_activate(
ResId, ResId,
#{resource_id => ResId, reason => resource_down}, #{resource_id => ResId, reason => resource_down},
<<"resource down: ", HrError/binary>> <<"resource down: ", HrError/binary>>
@ -636,7 +636,7 @@ maybe_resume_resource_workers(_, _) ->
maybe_clear_alarm(<<?TEST_ID_PREFIX, _/binary>>) -> maybe_clear_alarm(<<?TEST_ID_PREFIX, _/binary>>) ->
ok; ok;
maybe_clear_alarm(ResId) -> maybe_clear_alarm(ResId) ->
emqx_alarm:deactivate(ResId). emqx_alarm:safe_deactivate(ResId).
parse_health_check_result(Status, Data) when ?IS_STATUS(Status) -> parse_health_check_result(Status, Data) when ?IS_STATUS(Status) ->
{Status, Data#data.state, status_to_error(Status)}; {Status, Data#data.state, status_to_error(Status)};

View File

@ -0,0 +1,7 @@
Improve 'emqx_alarm' performance by using Mnesia dirty operations and avoiding
unnecessary calls from 'emqx_resource_manager' to reactivate alarms that have already been activated.
Use new safe 'emqx_alarm' API to activate/deactivate alarms to ensure that emqx_resource_manager
doesn't crash because of alarm timeouts.
The crashes were possible when the following conditions co-occurred:
- a relatively high number of failing resources (e.g. bridges) tried to activate alarms on recurring errors;
- the system experienced a very high load.