fix(emqx_alarm): add safe call API to activate/deactivate alarms and use it in resource_manager
Don't let 'emqx_resource_manager' crash because of emqx_alarm timeouts. Fixes: EMQX-9529/#10357
This commit is contained in:
parent
b960d2ecb3
commit
423a30fbb3
|
@ -42,7 +42,9 @@
|
||||||
get_alarms/0,
|
get_alarms/0,
|
||||||
get_alarms/1,
|
get_alarms/1,
|
||||||
format/1,
|
format/1,
|
||||||
format/2
|
format/2,
|
||||||
|
safe_activate/3,
|
||||||
|
safe_deactivate/1
|
||||||
]).
|
]).
|
||||||
|
|
||||||
%% gen_server callbacks
|
%% gen_server callbacks
|
||||||
|
@ -122,6 +124,9 @@ activate(Name, Details) ->
|
||||||
activate(Name, Details, Message) ->
|
activate(Name, Details, Message) ->
|
||||||
gen_server:call(?MODULE, {activate_alarm, Name, Details, Message}).
|
gen_server:call(?MODULE, {activate_alarm, Name, Details, Message}).
|
||||||
|
|
||||||
|
safe_activate(Name, Details, Message) ->
|
||||||
|
safe_call({activate_alarm, Name, Details, Message}).
|
||||||
|
|
||||||
-spec ensure_deactivated(binary() | atom()) -> ok.
|
-spec ensure_deactivated(binary() | atom()) -> ok.
|
||||||
ensure_deactivated(Name) ->
|
ensure_deactivated(Name) ->
|
||||||
ensure_deactivated(Name, no_details).
|
ensure_deactivated(Name, no_details).
|
||||||
|
@ -154,6 +159,9 @@ deactivate(Name, Details) ->
|
||||||
deactivate(Name, Details, Message) ->
|
deactivate(Name, Details, Message) ->
|
||||||
gen_server:call(?MODULE, {deactivate_alarm, Name, Details, Message}).
|
gen_server:call(?MODULE, {deactivate_alarm, Name, Details, Message}).
|
||||||
|
|
||||||
|
safe_deactivate(Name) ->
|
||||||
|
safe_call({deactivate_alarm, Name, no_details, <<"">>}).
|
||||||
|
|
||||||
-spec delete_all_deactivated_alarms() -> ok.
|
-spec delete_all_deactivated_alarms() -> ok.
|
||||||
delete_all_deactivated_alarms() ->
|
delete_all_deactivated_alarms() ->
|
||||||
gen_server:call(?MODULE, delete_all_deactivated_alarms).
|
gen_server:call(?MODULE, delete_all_deactivated_alarms).
|
||||||
|
@ -468,3 +476,19 @@ normalize_message(Name, <<"">>) ->
|
||||||
list_to_binary(io_lib:format("~p", [Name]));
|
list_to_binary(io_lib:format("~p", [Name]));
|
||||||
normalize_message(_Name, Message) ->
|
normalize_message(_Name, Message) ->
|
||||||
Message.
|
Message.
|
||||||
|
|
||||||
|
safe_call(Req) ->
|
||||||
|
try
|
||||||
|
gen_server:call(?MODULE, Req)
|
||||||
|
catch
|
||||||
|
_:{timeout, _} = Reason ->
|
||||||
|
?SLOG(warning, #{msg => "emqx_alarm_safe_call_timeout", reason => Reason}),
|
||||||
|
{error, timeout};
|
||||||
|
_:Reason:St ->
|
||||||
|
?SLOG(error, #{
|
||||||
|
msg => "emqx_alarm_safe_call_exception",
|
||||||
|
reason => Reason,
|
||||||
|
stacktrace => St
|
||||||
|
}),
|
||||||
|
{error, Reason}
|
||||||
|
end.
|
||||||
|
|
|
@ -375,7 +375,7 @@ handle_event(state_timeout, health_check, connecting, Data) ->
|
||||||
%% and successful health_checks
|
%% and successful health_checks
|
||||||
handle_event(enter, _OldState, connected = State, Data) ->
|
handle_event(enter, _OldState, connected = State, Data) ->
|
||||||
ok = log_state_consistency(State, Data),
|
ok = log_state_consistency(State, Data),
|
||||||
_ = emqx_alarm:deactivate(Data#data.id),
|
_ = emqx_alarm:safe_deactivate(Data#data.id),
|
||||||
?tp(resource_connected_enter, #{}),
|
?tp(resource_connected_enter, #{}),
|
||||||
{keep_state_and_data, health_check_actions(Data)};
|
{keep_state_and_data, health_check_actions(Data)};
|
||||||
handle_event(state_timeout, health_check, connected, Data) ->
|
handle_event(state_timeout, health_check, connected, Data) ->
|
||||||
|
@ -618,7 +618,7 @@ maybe_alarm(_Status, ResId, Error, _PrevError) ->
|
||||||
{error, undefined} -> <<"Unknown reason">>;
|
{error, undefined} -> <<"Unknown reason">>;
|
||||||
{error, Reason} -> emqx_utils:readable_error_msg(Reason)
|
{error, Reason} -> emqx_utils:readable_error_msg(Reason)
|
||||||
end,
|
end,
|
||||||
emqx_alarm:activate(
|
emqx_alarm:safe_activate(
|
||||||
ResId,
|
ResId,
|
||||||
#{resource_id => ResId, reason => resource_down},
|
#{resource_id => ResId, reason => resource_down},
|
||||||
<<"resource down: ", HrError/binary>>
|
<<"resource down: ", HrError/binary>>
|
||||||
|
@ -636,7 +636,7 @@ maybe_resume_resource_workers(_, _) ->
|
||||||
maybe_clear_alarm(<<?TEST_ID_PREFIX, _/binary>>) ->
|
maybe_clear_alarm(<<?TEST_ID_PREFIX, _/binary>>) ->
|
||||||
ok;
|
ok;
|
||||||
maybe_clear_alarm(ResId) ->
|
maybe_clear_alarm(ResId) ->
|
||||||
emqx_alarm:deactivate(ResId).
|
emqx_alarm:safe_deactivate(ResId).
|
||||||
|
|
||||||
parse_health_check_result(Status, Data) when ?IS_STATUS(Status) ->
|
parse_health_check_result(Status, Data) when ?IS_STATUS(Status) ->
|
||||||
{Status, Data#data.state, status_to_error(Status)};
|
{Status, Data#data.state, status_to_error(Status)};
|
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
Improve 'emqx_alarm' performance by using Mnesia dirty operations and avoiding
|
||||||
|
unnecessary calls from 'emqx_resource_manager' to reactivate alarms that have been already activated.
|
||||||
|
Use new safe 'emqx_alarm' API to activate/deactivate alarms to ensure that emqx_resource_manager
|
||||||
|
doesn't crash because of alarm timeouts.
|
||||||
|
The crashes were possible when the following conditions co-occurred:
|
||||||
|
- a relatively high number of failing resources, e.g. bridges tried to activate alarms on re-occurring errors;
|
||||||
|
- the system experienced a very high load.
|
Loading…
Reference in New Issue