fix(os_mon): do mem check and alarm triggers in emqx_os_mon

This commit is contained in:
Zaiming (Stone) Shi 2022-05-13 12:44:27 +02:00
parent 95f81126ca
commit 1ea0639321
4 changed files with 116 additions and 77 deletions

View File

@ -35,6 +35,7 @@
deactivate/1, deactivate/1,
deactivate/2, deactivate/2,
deactivate/3, deactivate/3,
ensure_deactivated/3,
delete_all_deactivated_alarms/0, delete_all_deactivated_alarms/0,
get_alarms/0, get_alarms/0,
get_alarms/1, get_alarms/1,
@ -120,6 +121,17 @@ deactivate(Name) ->
deactivate(Name, Details) -> deactivate(Name, Details) ->
deactivate(Name, Details, <<"">>). deactivate(Name, Details, <<"">>).
%% Deactivate the alarm `Name' only when it is currently listed as activated.
%%
%% Looks `Name' up in the ?ACTIVATED_ALARM table with a dirty read. When no
%% record is found the function returns `ok' without calling deactivate/3.
%% Otherwise it delegates to deactivate/3 and maps `{error, not_found}' to
%% `ok'; every other result of deactivate/3 is returned unchanged.
ensure_deactivated(Name, Details, Message) ->
    Activated = mnesia:dirty_read(?ACTIVATED_ALARM, Name),
    case Activated =:= [] of
        true ->
            %% No activated record under this name: nothing to deactivate.
            ok;
        false ->
            case deactivate(Name, Details, Message) of
                {error, not_found} ->
                    %% deactivate/3 found nothing to clear; the caller only
                    %% cares that the alarm is not active, so report success.
                    ok;
                Result ->
                    Result
            end
    end.
deactivate(Name, Details, Message) -> deactivate(Name, Details, Message) ->
gen_server:call(?MODULE, {deactivate_alarm, Name, Details, Message}). gen_server:call(?MODULE, {deactivate_alarm, Name, Details, Message}).

View File

@ -56,18 +56,6 @@ init({_Args, {alarm_handler, _ExistingAlarms}}) ->
init(_) -> init(_) ->
{ok, []}. {ok, []}.
handle_event({set_alarm, {system_memory_high_watermark, []}}, State) ->
HighWatermark = emqx_os_mon:get_sysmem_high_watermark(),
Message = to_bin("System memory usage is higher than ~p%", [HighWatermark]),
emqx_alarm:activate(
high_system_memory_usage,
#{
high_watermark => HighWatermark,
percent => emqx_os_mon:current_sysmem_percent()
},
Message
),
{ok, State};
handle_event({set_alarm, {process_memory_high_watermark, Pid}}, State) -> handle_event({set_alarm, {process_memory_high_watermark, Pid}}, State) ->
HighWatermark = emqx_os_mon:get_procmem_high_watermark(), HighWatermark = emqx_os_mon:get_procmem_high_watermark(),
Message = to_bin("Process memory usage is higher than ~p%", [HighWatermark]), Message = to_bin("Process memory usage is higher than ~p%", [HighWatermark]),
@ -80,9 +68,6 @@ handle_event({set_alarm, {process_memory_high_watermark, Pid}}, State) ->
Message Message
), ),
{ok, State}; {ok, State};
handle_event({clear_alarm, system_memory_high_watermark}, State) ->
_ = emqx_alarm:deactivate(high_system_memory_usage),
{ok, State};
handle_event({clear_alarm, process_memory_high_watermark}, State) -> handle_event({clear_alarm, process_memory_high_watermark}, State) ->
_ = emqx_alarm:deactivate(high_process_memory_usage), _ = emqx_alarm:deactivate(high_process_memory_usage),
{ok, State}; {ok, State};

View File

@ -65,10 +65,10 @@ set_mem_check_interval(Seconds) ->
memsup:set_check_interval(Seconds div 60000). memsup:set_check_interval(Seconds div 60000).
get_sysmem_high_watermark() -> get_sysmem_high_watermark() ->
memsup:get_sysmem_high_watermark(). gen_server:call(?OS_MON, ?FUNCTION_NAME, infinity).
set_sysmem_high_watermark(Float) -> set_sysmem_high_watermark(Float) ->
memsup:set_sysmem_high_watermark(Float). gen_server:call(?OS_MON, {?FUNCTION_NAME, Float}, infinity).
get_procmem_high_watermark() -> get_procmem_high_watermark() ->
memsup:get_procmem_high_watermark(). memsup:get_procmem_high_watermark().
@ -79,7 +79,7 @@ set_procmem_high_watermark(Float) ->
current_sysmem_percent() -> current_sysmem_percent() ->
case load_ctl:get_memory_usage() of case load_ctl:get_memory_usage() of
0 -> 0 ->
undefined; 0;
Ratio -> Ratio ->
erlang:floor(Ratio * 10000) / 100 erlang:floor(Ratio * 10000) / 100
end. end.
@ -89,19 +89,26 @@ current_sysmem_percent() ->
%%-------------------------------------------------------------------- %%--------------------------------------------------------------------
init([]) -> init([]) ->
%% memsup is not reliable, ignore
memsup:set_sysmem_high_watermark(1.0),
#{ #{
sysmem_high_watermark := SysHW, sysmem_high_watermark := SysHW,
procmem_high_watermark := PHW, procmem_high_watermark := PHW,
mem_check_interval := MCI mem_check_interval := MCI
} = emqx:get_config([sysmon, os]), } = emqx:get_config([sysmon, os]),
set_sysmem_high_watermark(SysHW),
set_procmem_high_watermark(PHW), set_procmem_high_watermark(PHW),
set_mem_check_interval(MCI), set_mem_check_interval(MCI),
ensure_system_memory_alarm(SysHW), update_mem_alarm_stauts(SysHW),
_ = start_check_timer(), _ = start_mem_check_timer(),
{ok, #{}}. _ = start_cpu_check_timer(),
{ok, #{sysmem_high_watermark => SysHW}}.
handle_call(get_sysmem_high_watermark, _From, #{sysmem_high_watermark := HWM} = State) ->
{reply, HWM, State};
handle_call({set_sysmem_high_watermark, New}, _From, #{sysmem_high_watermark := _Old} = State) ->
ok = update_mem_alarm_stauts(New),
{reply, ok, State#{sysmem_high_watermark := New}};
handle_call(Req, _From, State) -> handle_call(Req, _From, State) ->
{reply, {error, {unexpected_call, Req}}, State}. {reply, {error, {unexpected_call, Req}}, State}.
@ -109,18 +116,21 @@ handle_cast(Msg, State) ->
?SLOG(error, #{msg => "unexpected_cast", cast => Msg}), ?SLOG(error, #{msg => "unexpected_cast", cast => Msg}),
{noreply, State}. {noreply, State}.
handle_info({timeout, _Timer, check}, State) -> handle_info({timeout, _Timer, mem_check}, #{sysmem_high_watermark := HWM} = State) ->
ok = update_mem_alarm_stauts(HWM),
ok = start_mem_check_timer(),
{noreply, State};
handle_info({timeout, _Timer, cpu_check}, State) ->
CPUHighWatermark = emqx:get_config([sysmon, os, cpu_high_watermark]) * 100, CPUHighWatermark = emqx:get_config([sysmon, os, cpu_high_watermark]) * 100,
CPULowWatermark = emqx:get_config([sysmon, os, cpu_low_watermark]) * 100, CPULowWatermark = emqx:get_config([sysmon, os, cpu_low_watermark]) * 100,
%% TODO: should be improved? %% TODO: should be improved?
_ =
case emqx_vm:cpu_util() of case emqx_vm:cpu_util() of
0 -> 0 ->
ok; ok;
Busy when Busy > CPUHighWatermark -> Busy when Busy > CPUHighWatermark ->
Usage = list_to_binary(io_lib:format("~.2f%", [Busy])), Usage = list_to_binary(io_lib:format("~.2f%", [Busy])),
Message = <<Usage/binary, " cpu usage">>, Message = <<Usage/binary, " cpu usage">>,
emqx_alarm:activate( _ = emqx_alarm:activate(
high_cpu_usage, high_cpu_usage,
#{ #{
usage => Usage, usage => Usage,
@ -128,12 +138,11 @@ handle_info({timeout, _Timer, check}, State) ->
low_watermark => CPULowWatermark low_watermark => CPULowWatermark
}, },
Message Message
), );
start_check_timer();
Busy when Busy < CPULowWatermark -> Busy when Busy < CPULowWatermark ->
Usage = list_to_binary(io_lib:format("~.2f%", [Busy])), Usage = list_to_binary(io_lib:format("~.2f%", [Busy])),
Message = <<Usage/binary, " cpu usage">>, Message = <<Usage/binary, " cpu usage">>,
emqx_alarm:deactivate( ok = emqx_alarm:ensure_deactivated(
high_cpu_usage, high_cpu_usage,
#{ #{
usage => Usage, usage => Usage,
@ -141,11 +150,11 @@ handle_info({timeout, _Timer, check}, State) ->
low_watermark => CPULowWatermark low_watermark => CPULowWatermark
}, },
Message Message
), );
start_check_timer();
_Busy -> _Busy ->
start_check_timer() ok
end, end,
ok = start_cpu_check_timer(),
{noreply, State}; {noreply, State};
handle_info(Info, State) -> handle_info(Info, State) ->
?SLOG(error, #{msg => "unexpected_info", info => Info}), ?SLOG(error, #{msg => "unexpected_info", info => Info}),
@ -161,26 +170,61 @@ code_change(_OldVsn, State, _Extra) ->
%% Internal functions %% Internal functions
%%-------------------------------------------------------------------- %%--------------------------------------------------------------------
start_check_timer() -> start_cpu_check_timer() ->
Interval = emqx:get_config([sysmon, os, cpu_check_interval]), Interval = emqx:get_config([sysmon, os, cpu_check_interval]),
case erlang:system_info(system_architecture) of case erlang:system_info(system_architecture) of
"x86_64-pc-linux-musl" -> ok; "x86_64-pc-linux-musl" -> ok;
_ -> emqx_misc:start_timer(Interval, check) _ -> _ = emqx_misc:start_timer(Interval, cpu_check)
end. end,
ok.
%% Arm the timer that delivers the next `mem_check' message.
%%
%% The interval is read from the [sysmon, os, mem_check_interval] config.
%% A timer is started only when the OS type is {unix, linux} and the
%% configured interval is an integer; in every other case nothing is
%% scheduled. Always returns `ok'.
start_mem_check_timer() ->
    Interval = emqx:get_config([sysmon, os, mem_check_interval]),
    %% Only Linux is supported for now (sorry Mac and windows).
    OnLinux = os:type() =:= {unix, linux},
    ShouldStart = OnLinux andalso is_integer(Interval),
    case ShouldStart of
        true ->
            _ = emqx_misc:start_timer(Interval, mem_check),
            ok;
        false ->
            ok
    end.
%% At startup, memsup starts first and checks for memory alarms, update_mem_alarm_stauts(HWM) when HWM > 1.0 orelse HWM < 0.0 ->
%% but emqx_alarm_handler is not yet used instead of alarm_handler, ?SLOG(warning, #{msg => "discarded_out_of_range_mem_alarm_threshold", value => HWM}),
%% so alarm_handler is used directly for notification (normally emqx_alarm_handler should be used). ok = emqx_alarm:ensure_deactivated(
%%The internal memsup will no longer trigger events that have been alerted, high_system_memory_usage,
%% and there is no exported function to remove the alerted flag, #{},
%% so it can only be checked again at startup. <<"Deactivated mem usage alarm due to out of range threshold">>
);
ensure_system_memory_alarm(HW) when HW =< 1.0 andalso HW >= 0 -> update_mem_alarm_stauts(HWM0) ->
case current_sysmem_percent() of HWM = HWM0 * 100,
Usage when Usage > (HW * 100) -> Usage = current_sysmem_percent(),
gen_event:notify( UsageStr = list_to_binary(io_lib:format("~.2f%", [Usage])),
alarm_handler, {set_alarm, {system_memory_high_watermark, []}} Message = <<UsageStr/binary, " mem usage">>,
case Usage > HWM of
true ->
_ = emqx_alarm:activate(
high_system_memory_usage,
#{
usage => Usage,
high_watermark => HWM
},
Message
); );
_ -> _ ->
ok ok = emqx_alarm:ensure_deactivated(
end. high_system_memory_usage,
#{
usage => Usage,
high_watermark => HWM
},
Message
)
end,
ok.

View File

@ -33,8 +33,6 @@ init_per_suite(Config) ->
{cpu_check_interval, 1}, {cpu_check_interval, 1},
{cpu_high_watermark, 5}, {cpu_high_watermark, 5},
{cpu_low_watermark, 80}, {cpu_low_watermark, 80},
{mem_check_interval, 60},
{sysmem_high_watermark, 70},
{procmem_high_watermark, 5} {procmem_high_watermark, 5}
]); ]);
(_) -> (_) ->
@ -53,9 +51,9 @@ t_api(_) ->
?assertEqual(ok, emqx_os_mon:set_mem_check_interval(122000)), ?assertEqual(ok, emqx_os_mon:set_mem_check_interval(122000)),
?assertEqual(120000, emqx_os_mon:get_mem_check_interval()), ?assertEqual(120000, emqx_os_mon:get_mem_check_interval()),
?assertEqual(70, emqx_os_mon:get_sysmem_high_watermark()), ?assertEqual(0.7, emqx_os_mon:get_sysmem_high_watermark()),
?assertEqual(ok, emqx_os_mon:set_sysmem_high_watermark(0.8)), ?assertEqual(ok, emqx_os_mon:set_sysmem_high_watermark(0.8)),
?assertEqual(80, emqx_os_mon:get_sysmem_high_watermark()), ?assertEqual(0.8, emqx_os_mon:get_sysmem_high_watermark()),
?assertEqual(5, emqx_os_mon:get_procmem_high_watermark()), ?assertEqual(5, emqx_os_mon:get_procmem_high_watermark()),
?assertEqual(ok, emqx_os_mon:set_procmem_high_watermark(0.11)), ?assertEqual(ok, emqx_os_mon:set_procmem_high_watermark(0.11)),