From 1ea06393212d2a0e7d97b3a167928b103a203c3a Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Fri, 13 May 2022 12:44:27 +0200 Subject: [PATCH] fix(os_mon): do mem check and alarm triggers in emqx_os_mon --- apps/emqx/src/emqx_alarm.erl | 12 ++ apps/emqx/src/emqx_alarm_handler.erl | 15 --- apps/emqx/src/emqx_os_mon.erl | 160 +++++++++++++++++---------- apps/emqx/test/emqx_os_mon_SUITE.erl | 6 +- 4 files changed, 116 insertions(+), 77 deletions(-) diff --git a/apps/emqx/src/emqx_alarm.erl b/apps/emqx/src/emqx_alarm.erl index 9ef410299..eca1aeed2 100644 --- a/apps/emqx/src/emqx_alarm.erl +++ b/apps/emqx/src/emqx_alarm.erl @@ -35,6 +35,7 @@ deactivate/1, deactivate/2, deactivate/3, + ensure_deactivated/3, delete_all_deactivated_alarms/0, get_alarms/0, get_alarms/1, @@ -120,6 +121,17 @@ deactivate(Name) -> deactivate(Name, Details) -> deactivate(Name, Details, <<"">>). +ensure_deactivated(Name, Details, Message) -> + case mnesia:dirty_read(?ACTIVATED_ALARM, Name) of + [] -> + ok; + _ -> + case deactivate(Name, Details, Message) of + {error, not_found} -> ok; + Other -> Other + end + end. + deactivate(Name, Details, Message) -> gen_server:call(?MODULE, {deactivate_alarm, Name, Details, Message}). diff --git a/apps/emqx/src/emqx_alarm_handler.erl b/apps/emqx/src/emqx_alarm_handler.erl index 2ba280f44..66d2303d5 100644 --- a/apps/emqx/src/emqx_alarm_handler.erl +++ b/apps/emqx/src/emqx_alarm_handler.erl @@ -56,18 +56,6 @@ init({_Args, {alarm_handler, _ExistingAlarms}}) -> init(_) -> {ok, []}. 
-handle_event({set_alarm, {system_memory_high_watermark, []}}, State) -> - HighWatermark = emqx_os_mon:get_sysmem_high_watermark(), - Message = to_bin("System memory usage is higher than ~p%", [HighWatermark]), - emqx_alarm:activate( - high_system_memory_usage, - #{ - high_watermark => HighWatermark, - percent => emqx_os_mon:current_sysmem_percent() - }, - Message - ), - {ok, State}; handle_event({set_alarm, {process_memory_high_watermark, Pid}}, State) -> HighWatermark = emqx_os_mon:get_procmem_high_watermark(), Message = to_bin("Process memory usage is higher than ~p%", [HighWatermark]), @@ -80,9 +68,6 @@ handle_event({set_alarm, {process_memory_high_watermark, Pid}}, State) -> Message ), {ok, State}; -handle_event({clear_alarm, system_memory_high_watermark}, State) -> - _ = emqx_alarm:deactivate(high_system_memory_usage), - {ok, State}; handle_event({clear_alarm, process_memory_high_watermark}, State) -> _ = emqx_alarm:deactivate(high_process_memory_usage), {ok, State}; diff --git a/apps/emqx/src/emqx_os_mon.erl b/apps/emqx/src/emqx_os_mon.erl index d4766c29a..c0960abd6 100644 --- a/apps/emqx/src/emqx_os_mon.erl +++ b/apps/emqx/src/emqx_os_mon.erl @@ -65,10 +65,10 @@ set_mem_check_interval(Seconds) -> memsup:set_check_interval(Seconds div 60000). get_sysmem_high_watermark() -> - memsup:get_sysmem_high_watermark(). + gen_server:call(?OS_MON, ?FUNCTION_NAME, infinity). set_sysmem_high_watermark(Float) -> - memsup:set_sysmem_high_watermark(Float). + gen_server:call(?OS_MON, {?FUNCTION_NAME, Float}, infinity). get_procmem_high_watermark() -> memsup:get_procmem_high_watermark(). @@ -79,7 +79,7 @@ set_procmem_high_watermark(Float) -> current_sysmem_percent() -> case load_ctl:get_memory_usage() of 0 -> - undefined; + 0; Ratio -> erlang:floor(Ratio * 10000) / 100 end. 
@@ -89,19 +89,26 @@ current_sysmem_percent() -> %%-------------------------------------------------------------------- init([]) -> + %% memsup is not reliable, ignore + memsup:set_sysmem_high_watermark(1.0), #{ sysmem_high_watermark := SysHW, procmem_high_watermark := PHW, mem_check_interval := MCI } = emqx:get_config([sysmon, os]), - set_sysmem_high_watermark(SysHW), set_procmem_high_watermark(PHW), set_mem_check_interval(MCI), - ensure_system_memory_alarm(SysHW), - _ = start_check_timer(), - {ok, #{}}. + update_mem_alarm_stauts(SysHW), + _ = start_mem_check_timer(), + _ = start_cpu_check_timer(), + {ok, #{sysmem_high_watermark => SysHW}}. +handle_call(get_sysmem_high_watermark, _From, #{sysmem_high_watermark := HWM} = State) -> + {reply, HWM, State}; +handle_call({set_sysmem_high_watermark, New}, _From, #{sysmem_high_watermark := _Old} = State) -> + ok = update_mem_alarm_stauts(New), + {reply, ok, State#{sysmem_high_watermark := New}}; handle_call(Req, _From, State) -> {reply, {error, {unexpected_call, Req}}, State}. @@ -109,43 +116,45 @@ handle_cast(Msg, State) -> ?SLOG(error, #{msg => "unexpected_cast", cast => Msg}), {noreply, State}. -handle_info({timeout, _Timer, check}, State) -> +handle_info({timeout, _Timer, mem_check}, #{sysmem_high_watermark := HWM} = State) -> + ok = update_mem_alarm_stauts(HWM), + ok = start_mem_check_timer(), + {noreply, State}; +handle_info({timeout, _Timer, cpu_check}, State) -> CPUHighWatermark = emqx:get_config([sysmon, os, cpu_high_watermark]) * 100, CPULowWatermark = emqx:get_config([sysmon, os, cpu_low_watermark]) * 100, %% TODO: should be improved? 
- _ = - case emqx_vm:cpu_util() of - 0 -> - ok; - Busy when Busy > CPUHighWatermark -> - Usage = list_to_binary(io_lib:format("~.2f%", [Busy])), - Message = <<Usage/binary, " cpu usage">>, - emqx_alarm:activate( - high_cpu_usage, - #{ - usage => Usage, - high_watermark => CPUHighWatermark, - low_watermark => CPULowWatermark - }, - Message - ), - start_check_timer(); - Busy when Busy < CPULowWatermark -> - Usage = list_to_binary(io_lib:format("~.2f%", [Busy])), - Message = <<Usage/binary, " cpu usage">>, - emqx_alarm:deactivate( - high_cpu_usage, - #{ - usage => Usage, - high_watermark => CPUHighWatermark, - low_watermark => CPULowWatermark - }, - Message - ), - start_check_timer(); - _Busy -> - start_check_timer() - end, + case emqx_vm:cpu_util() of + 0 -> + ok; + Busy when Busy > CPUHighWatermark -> + Usage = list_to_binary(io_lib:format("~.2f%", [Busy])), + Message = <<Usage/binary, " cpu usage">>, + _ = emqx_alarm:activate( + high_cpu_usage, + #{ + usage => Usage, + high_watermark => CPUHighWatermark, + low_watermark => CPULowWatermark + }, + Message + ); + Busy when Busy < CPULowWatermark -> + Usage = list_to_binary(io_lib:format("~.2f%", [Busy])), + Message = <<Usage/binary, " cpu usage">>, + ok = emqx_alarm:ensure_deactivated( + high_cpu_usage, + #{ + usage => Usage, + high_watermark => CPUHighWatermark, + low_watermark => CPULowWatermark + }, + Message + ); + _Busy -> + ok + end, + ok = start_cpu_check_timer(), {noreply, State}; handle_info(Info, State) -> ?SLOG(error, #{msg => "unexpected_info", info => Info}), @@ -161,26 +170,61 @@ code_change(_OldVsn, State, _Extra) -> %% Internal functions %%-------------------------------------------------------------------- -start_check_timer() -> +start_cpu_check_timer() -> Interval = emqx:get_config([sysmon, os, cpu_check_interval]), case erlang:system_info(system_architecture) of "x86_64-pc-linux-musl" -> ok; - _ -> emqx_misc:start_timer(Interval, check) - end. + _ -> _ = emqx_misc:start_timer(Interval, cpu_check) + end, + ok. 
+start_mem_check_timer() -> + Interval = emqx:get_config([sysmon, os, mem_check_interval]), + IsSupported = + case os:type() of + {unix, linux} -> + true; + _ -> + %% sorry Mac and windows, for now + false + end, + case is_integer(Interval) andalso IsSupported of + true -> + _ = emqx_misc:start_timer(Interval, mem_check); + false -> + ok + end, + ok. -%% At startup, memsup starts first and checks for memory alarms, -%% but emqx_alarm_handler is not yet used instead of alarm_handler, -%% so alarm_handler is used directly for notification (normally emqx_alarm_handler should be used). -%%The internal memsup will no longer trigger events that have been alerted, -%% and there is no exported function to remove the alerted flag, -%% so it can only be checked again at startup. - -ensure_system_memory_alarm(HW) when HW =< 1.0 andalso HW >= 0 -> - case current_sysmem_percent() of - Usage when Usage > (HW * 100) -> - gen_event:notify( - alarm_handler, {set_alarm, {system_memory_high_watermark, []}} +update_mem_alarm_stauts(HWM) when HWM > 1.0 orelse HWM < 0.0 -> + ?SLOG(warning, #{msg => "discarded_out_of_range_mem_alarm_threshold", value => HWM}), + ok = emqx_alarm:ensure_deactivated( + high_system_memory_usage, + #{}, + <<"Deactivated mem usage alarm due to out of range threshold">> + ); +update_mem_alarm_stauts(HWM0) -> + HWM = HWM0 * 100, + Usage = current_sysmem_percent(), + UsageStr = list_to_binary(io_lib:format("~.2f%", [Usage])), + Message = <<UsageStr/binary, " mem usage">>, + case Usage > HWM of + true -> + _ = emqx_alarm:activate( + high_system_memory_usage, + #{ + usage => Usage, + high_watermark => HWM + }, + Message ); _ -> - ok - end. + ok = emqx_alarm:ensure_deactivated( + high_system_memory_usage, + #{ + usage => Usage, + high_watermark => HWM + }, + Message + ) + end, + ok. 
diff --git a/apps/emqx/test/emqx_os_mon_SUITE.erl b/apps/emqx/test/emqx_os_mon_SUITE.erl index 38bc2acf2..c558669af 100644 --- a/apps/emqx/test/emqx_os_mon_SUITE.erl +++ b/apps/emqx/test/emqx_os_mon_SUITE.erl @@ -33,8 +33,6 @@ init_per_suite(Config) -> {cpu_check_interval, 1}, {cpu_high_watermark, 5}, {cpu_low_watermark, 80}, - {mem_check_interval, 60}, - {sysmem_high_watermark, 70}, {procmem_high_watermark, 5} ]); (_) -> @@ -53,9 +51,9 @@ t_api(_) -> ?assertEqual(ok, emqx_os_mon:set_mem_check_interval(122000)), ?assertEqual(120000, emqx_os_mon:get_mem_check_interval()), - ?assertEqual(70, emqx_os_mon:get_sysmem_high_watermark()), + ?assertEqual(0.7, emqx_os_mon:get_sysmem_high_watermark()), ?assertEqual(ok, emqx_os_mon:set_sysmem_high_watermark(0.8)), - ?assertEqual(80, emqx_os_mon:get_sysmem_high_watermark()), + ?assertEqual(0.8, emqx_os_mon:get_sysmem_high_watermark()), ?assertEqual(5, emqx_os_mon:get_procmem_high_watermark()), ?assertEqual(ok, emqx_os_mon:set_procmem_high_watermark(0.11)),