From 423a30fbb3392538807b1289a401e0f4b7b2ded2 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Wed, 19 Apr 2023 21:35:56 +0300 Subject: [PATCH] fix(emqx_alarm): add safe call API to activate/deactivate alarms and use it in resource_manager Don't let 'emqx_resource_manager' crash because of emqx_alarm timeouts. Fixes: EMQX-9529/#10357 --- apps/emqx/src/emqx_alarm.erl | 26 ++++++++++++++++++- .../src/emqx_resource_manager.erl | 6 ++--- changes/ce/fix-10407.en.md | 7 +++++ 3 files changed, 35 insertions(+), 4 deletions(-) create mode 100644 changes/ce/fix-10407.en.md diff --git a/apps/emqx/src/emqx_alarm.erl b/apps/emqx/src/emqx_alarm.erl index 7255e96da..056f36050 100644 --- a/apps/emqx/src/emqx_alarm.erl +++ b/apps/emqx/src/emqx_alarm.erl @@ -42,7 +42,9 @@ get_alarms/0, get_alarms/1, format/1, - format/2 + format/2, + safe_activate/3, + safe_deactivate/1 ]). %% gen_server callbacks @@ -122,6 +124,9 @@ activate(Name, Details) -> activate(Name, Details, Message) -> gen_server:call(?MODULE, {activate_alarm, Name, Details, Message}). +safe_activate(Name, Details, Message) -> + safe_call({activate_alarm, Name, Details, Message}). + -spec ensure_deactivated(binary() | atom()) -> ok. ensure_deactivated(Name) -> ensure_deactivated(Name, no_details). @@ -154,6 +159,9 @@ deactivate(Name, Details) -> deactivate(Name, Details, Message) -> gen_server:call(?MODULE, {deactivate_alarm, Name, Details, Message}). +safe_deactivate(Name) -> + safe_call({deactivate_alarm, Name, no_details, <<"">>}). + -spec delete_all_deactivated_alarms() -> ok. delete_all_deactivated_alarms() -> gen_server:call(?MODULE, delete_all_deactivated_alarms). @@ -468,3 +476,19 @@ normalize_message(Name, <<"">>) -> list_to_binary(io_lib:format("~p", [Name])); normalize_message(_Name, Message) -> Message. + +safe_call(Req) -> + try + gen_server:call(?MODULE, Req) + catch + _:{timeout, _} = Reason -> + ?SLOG(warning, #{msg => "emqx_alarm_safe_call_timeout", reason => Reason}), + {error, timeout}; + _:Reason:St -> + ?SLOG(error, #{ + msg => "emqx_alarm_safe_call_exception", + reason => Reason, + stacktrace => St + }), + {error, Reason} + end. diff --git a/apps/emqx_resource/src/emqx_resource_manager.erl b/apps/emqx_resource/src/emqx_resource_manager.erl index b35bade77..877b35fff 100644 --- a/apps/emqx_resource/src/emqx_resource_manager.erl +++ b/apps/emqx_resource/src/emqx_resource_manager.erl @@ -375,7 +375,7 @@ handle_event(state_timeout, health_check, connecting, Data) -> %% and successful health_checks handle_event(enter, _OldState, connected = State, Data) -> ok = log_state_consistency(State, Data), - _ = emqx_alarm:deactivate(Data#data.id), + _ = emqx_alarm:safe_deactivate(Data#data.id), ?tp(resource_connected_enter, #{}), {keep_state_and_data, health_check_actions(Data)}; handle_event(state_timeout, health_check, connected, Data) -> @@ -618,7 +618,7 @@ maybe_alarm(_Status, ResId, Error, _PrevError) -> {error, undefined} -> <<"Unknown reason">>; {error, Reason} -> emqx_utils:readable_error_msg(Reason) end, - emqx_alarm:activate( + emqx_alarm:safe_activate( ResId, #{resource_id => ResId, reason => resource_down}, <<"resource down: ", HrError/binary>> @@ -636,7 +636,7 @@ maybe_resume_resource_workers(_, _) -> maybe_clear_alarm(<>) -> ok; maybe_clear_alarm(ResId) -> - emqx_alarm:deactivate(ResId). + emqx_alarm:safe_deactivate(ResId). parse_health_check_result(Status, Data) when ?IS_STATUS(Status) -> {Status, Data#data.state, status_to_error(Status)}; diff --git a/changes/ce/fix-10407.en.md b/changes/ce/fix-10407.en.md new file mode 100644 index 000000000..d9df9ce69 --- /dev/null +++ b/changes/ce/fix-10407.en.md @@ -0,0 +1,7 @@ +Improve 'emqx_alarm' performance by using Mnesia dirty operations and avoiding +unnecessary calls from 'emqx_resource_manager' to reactivate alarms that have been already activated. +Use new safe 'emqx_alarm' API to activate/deactivate alarms to ensure that emqx_resource_manager +doesn't crash because of alarm timeouts. +The crashes were possible when the following conditions co-occurred: + - a relatively high number of failing resources, e.g. bridges tried to activate alarms on re-occurring errors; + - the system experienced a very high load.