From b817e03c08b2a30ae3e3abb916b8701d58a6ba88 Mon Sep 17 00:00:00 2001 From: zhongwencool Date: Tue, 15 Aug 2023 17:48:59 +0800 Subject: [PATCH] fix: start os_mon application temporary --- apps/emqx/src/emqx_os_mon.erl | 18 +++++---- apps/emqx/src/emqx_schema.erl | 2 +- apps/emqx/src/emqx_sys_mon.erl | 8 +++- apps/emqx/src/emqx_sys_sup.erl | 7 +--- apps/emqx/src/emqx_vm.erl | 32 ++++++++++----- apps/emqx/test/emqx_os_mon_SUITE.erl | 48 ++++++++++++++++------- apps/emqx_machine/priv/reboot_lists.eterm | 3 +- apps/emqx_management/src/emqx_mgmt.erl | 13 +++--- changes/ce/fix-11445.en.md | 2 + 9 files changed, 86 insertions(+), 47 deletions(-) create mode 100644 changes/ce/fix-11445.en.md diff --git a/apps/emqx/src/emqx_os_mon.erl b/apps/emqx/src/emqx_os_mon.erl index f84636a84..ec2fb1d5f 100644 --- a/apps/emqx/src/emqx_os_mon.erl +++ b/apps/emqx/src/emqx_os_mon.erl @@ -38,15 +38,14 @@ %% gen_server callbacks -export([ init/1, + handle_continue/2, handle_call/3, handle_cast/2, handle_info/2, terminate/2, code_change/3 ]). --ifdef(TEST). --export([is_sysmem_check_supported/0]). --endif. +-export([is_os_check_supported/0]). -include("emqx.hrl"). @@ -83,12 +82,17 @@ current_sysmem_percent() -> %%-------------------------------------------------------------------- init([]) -> + %% start os_mon temporarily + {ok, _} = application:ensure_all_started(os_mon), + {ok, undefined, {continue, setup}}. + +handle_continue(setup, undefined) -> %% memsup is not reliable, ignore memsup:set_sysmem_high_watermark(1.0), SysHW = init_os_monitor(), MemRef = start_mem_check_timer(), CpuRef = start_cpu_check_timer(), - {ok, #{sysmem_high_watermark => SysHW, mem_time_ref => MemRef, cpu_time_ref => CpuRef}}. + {noreply, #{sysmem_high_watermark => SysHW, mem_time_ref => MemRef, cpu_time_ref => CpuRef}}. init_os_monitor() -> init_os_monitor(emqx:get_config([sysmon, os])). @@ -182,12 +186,12 @@ start_cpu_check_timer() -> _ -> start_timer(Interval, cpu_check) end. 
-is_sysmem_check_supported() -> +is_os_check_supported() -> {unix, linux} =:= os:type(). start_mem_check_timer() -> Interval = emqx:get_config([sysmon, os, mem_check_interval]), - case is_integer(Interval) andalso is_sysmem_check_supported() of + case is_integer(Interval) andalso is_os_check_supported() of true -> start_timer(Interval, mem_check); false -> @@ -205,7 +209,7 @@ update_mem_alarm_status(HWM) when HWM > 1.0 orelse HWM < 0.0 -> <<"Deactivated mem usage alarm due to out of range threshold">> ); update_mem_alarm_status(HWM) -> - is_sysmem_check_supported() andalso + is_os_check_supported() andalso do_update_mem_alarm_status(HWM), ok. diff --git a/apps/emqx/src/emqx_schema.erl b/apps/emqx/src/emqx_schema.erl index d746d3393..14881d54d 100644 --- a/apps/emqx/src/emqx_schema.erl +++ b/apps/emqx/src/emqx_schema.erl @@ -3659,7 +3659,7 @@ shared_subscription_strategy() -> )}. default_mem_check_interval() -> - case emqx_sys_sup:is_os_mon_supported() of + case emqx_os_mon:is_os_check_supported() of true -> <<"60s">>; false -> disabled end. diff --git a/apps/emqx/src/emqx_sys_mon.erl b/apps/emqx/src/emqx_sys_mon.erl index f1190f586..1d3d32199 100644 --- a/apps/emqx/src/emqx_sys_mon.erl +++ b/apps/emqx/src/emqx_sys_mon.erl @@ -29,6 +29,7 @@ %% gen_server callbacks -export([ init/1, + handle_continue/2, handle_call/3, handle_cast/2, handle_info/2, @@ -70,11 +71,14 @@ update(VM) -> init([]) -> emqx_logger:set_proc_metadata(#{sysmon => true}), - init_system_monitor(), + {ok, undefined, {continue, setup}}. +handle_continue(setup, undefined) -> + init_system_monitor(), %% Monitor cluster partition event ekka:monitor(partition, fun handle_partition_event/1), - {ok, start_timer(#{timer => undefined, events => []})}. + NewState = start_timer(#{timer => undefined, events => []}), + {noreply, NewState, hibernate}. start_timer(State) -> State#{timer := emqx_utils:start_timer(timer:seconds(2), reset)}. 
diff --git a/apps/emqx/src/emqx_sys_sup.erl b/apps/emqx/src/emqx_sys_sup.erl index 75f4976c9..25718ba76 100644 --- a/apps/emqx/src/emqx_sys_sup.erl +++ b/apps/emqx/src/emqx_sys_sup.erl @@ -19,8 +19,6 @@ -behaviour(supervisor). -export([start_link/0]). --export([is_os_mon_supported/0]). - -export([init/1]). start_link() -> @@ -28,7 +26,7 @@ start_link() -> init([]) -> OsMon = - case is_os_mon_supported() of + case emqx_os_mon:is_os_check_supported() of true -> [child_spec(emqx_os_mon)]; false -> [] end, @@ -45,9 +43,6 @@ init([]) -> %% Internal functions %%-------------------------------------------------------------------- -is_os_mon_supported() -> - erlang:function_exported(memsup, get_procmem_high_watermark, 0). - child_spec(Mod) -> child_spec(Mod, []). diff --git a/apps/emqx/src/emqx_vm.erl b/apps/emqx/src/emqx_vm.erl index 0d861f671..d3f98e06c 100644 --- a/apps/emqx/src/emqx_vm.erl +++ b/apps/emqx/src/emqx_vm.erl @@ -44,7 +44,7 @@ get_otp_version/0 ]). --export([cpu_util/0]). +-export([cpu_util/0, cpu_util/1]). -ifdef(TEST). -compile(export_all). @@ -378,18 +378,30 @@ avg15() -> cpu_util() -> compat_windows(fun cpu_sup:util/0). +cpu_util(Args) -> + compat_windows(fun cpu_sup:util/1, Args). + compat_windows(Fun) -> - case os:type() of - {win32, nt} -> - 0.0; - _Type -> - case catch Fun() of - Val when is_float(Val) -> floor(Val * 100) / 100; - Val when is_number(Val) -> Val; - _Error -> 0.0 - end + case compat_windows(Fun, []) of + Val when is_float(Val) -> floor(Val * 100) / 100; + Val when is_number(Val) -> Val; + _ -> 0.0 end. +compat_windows(Fun, Args) -> + try + case is_windows() of + true -> 0.0; + false when Args =:= [] -> Fun(); + false -> Fun(Args) + end + catch + _:_ -> 0.0 + end. + +is_windows() -> + os:type() =:= {win32, nt}. + load(Avg) -> floor((Avg / 256) * 100) / 100. 
diff --git a/apps/emqx/test/emqx_os_mon_SUITE.erl b/apps/emqx/test/emqx_os_mon_SUITE.erl index e76928114..1833be48e 100644 --- a/apps/emqx/test/emqx_os_mon_SUITE.erl +++ b/apps/emqx/test/emqx_os_mon_SUITE.erl @@ -39,29 +39,47 @@ init_per_testcase(t_cpu_check_alarm, Config) -> %% 200ms cpu_check_interval => 200 }), - ok = supervisor:terminate_child(emqx_sys_sup, emqx_os_mon), - {ok, _} = supervisor:restart_child(emqx_sys_sup, emqx_os_mon), + restart_os_mon(), Config; init_per_testcase(t_sys_mem_check_alarm, Config) -> - case emqx_os_mon:is_sysmem_check_supported() of + case emqx_os_mon:is_os_check_supported() of true -> SysMon = emqx_config:get([sysmon, os], #{}), emqx_config:put([sysmon, os], SysMon#{ sysmem_high_watermark => 0.51, %% 200ms mem_check_interval => 200 - }), - ok = supervisor:terminate_child(emqx_sys_sup, emqx_os_mon), - {ok, _} = supervisor:restart_child(emqx_sys_sup, emqx_os_mon), - Config; + }); false -> - Config - end; + ok + end, + restart_os_mon(), + Config; init_per_testcase(_, Config) -> - emqx_common_test_helpers:boot_modules(all), - emqx_common_test_helpers:start_apps([]), + restart_os_mon(), Config. +restart_os_mon() -> + case emqx_os_mon:is_os_check_supported() of + true -> + ok = supervisor:terminate_child(emqx_sys_sup, emqx_os_mon), + {ok, _} = supervisor:restart_child(emqx_sys_sup, emqx_os_mon); + false -> + _ = supervisor:terminate_child(emqx_sys_sup, emqx_os_mon), + _ = supervisor:delete_child(emqx_sys_sup, emqx_os_mon), + %% run test on mac/windows. + Mod = emqx_os_mon, + OsMon = #{ + id => Mod, + start => {Mod, start_link, []}, + restart => permanent, + shutdown => 5000, + type => worker, + modules => [Mod] + }, + {ok, _} = supervisor:start_child(emqx_sys_sup, OsMon) + end. + t_api(_) -> ?assertEqual(0.7, emqx_os_mon:get_sysmem_high_watermark()), ?assertEqual(ok, emqx_os_mon:set_sysmem_high_watermark(0.8)), @@ -81,7 +99,7 @@ t_api(_) -> ok. 
t_sys_mem_check_disable(Config) -> - case emqx_os_mon:is_sysmem_check_supported() of + case emqx_os_mon:is_os_check_supported() of true -> do_sys_mem_check_disable(Config); false -> skip end. @@ -100,7 +118,7 @@ do_sys_mem_check_disable(_Config) -> ok. t_sys_mem_check_alarm(Config) -> - case emqx_os_mon:is_sysmem_check_supported() of + case emqx_os_mon:is_os_check_supported() of true -> do_sys_mem_check_alarm(Config); false -> skip end. @@ -167,7 +185,7 @@ t_cpu_check_alarm(_) -> util, fun() -> CpuUtil end, fun() -> - timer:sleep(500), + timer:sleep(1000), Alarms = emqx_alarm:get_alarms(activated), ?assert( emqx_vm_mon_SUITE:is_existing(high_cpu_usage, emqx_alarm:get_alarms(activated)) @@ -193,7 +211,7 @@ t_cpu_check_alarm(_) -> ?assert(is_binary(Msg)), emqx_config:put([sysmon, os, cpu_high_watermark], 1), emqx_config:put([sysmon, os, cpu_low_watermark], 0.96), - timer:sleep(500), + timer:sleep(800), ?assertNot( emqx_vm_mon_SUITE:is_existing(high_cpu_usage, emqx_alarm:get_alarms(activated)) ) diff --git a/apps/emqx_machine/priv/reboot_lists.eterm b/apps/emqx_machine/priv/reboot_lists.eterm index 09c7fba37..0e2ecb799 100644 --- a/apps/emqx_machine/priv/reboot_lists.eterm +++ b/apps/emqx_machine/priv/reboot_lists.eterm @@ -17,7 +17,8 @@ asn1, syntax_tools, ssl, - os_mon, + %% only loaded here; emqx starts os_mon as a temporary application, because a permanent os_mon would crash the VM when it terminates. + {os_mon, load}, inets, compiler, runtime_tools, diff --git a/apps/emqx_management/src/emqx_mgmt.erl b/apps/emqx_management/src/emqx_mgmt.erl index 059c323ff..e00044add 100644 --- a/apps/emqx_management/src/emqx_mgmt.erl +++ b/apps/emqx_management/src/emqx_mgmt.erl @@ -197,13 +197,16 @@ vm_stats() -> ]. 
vm_stats('cpu.idle') -> - case cpu_sup:util([detailed]) of - %% Not support for Windows - {_, 0, 0, _} -> 0; - {_Num, _Use, IdleList, _} -> proplists:get_value(idle, IdleList, 0) + case emqx_vm:cpu_util([detailed]) of + {_Num, _Use, List, _} when is_list(List) -> proplists:get_value(idle, List, 0); + %% return {all, 0, 0, []} when cpu_sup is not started + _ -> 0 end; vm_stats('cpu.use') -> - 100 - vm_stats('cpu.idle'); + case vm_stats('cpu.idle') of + 0 -> 0; + Idle -> 100 - Idle + end; vm_stats('total.memory') -> {_, MemTotal} = get_sys_memory(), MemTotal; diff --git a/changes/ce/fix-11445.en.md b/changes/ce/fix-11445.en.md new file mode 100644 index 000000000..589846db2 --- /dev/null +++ b/changes/ce/fix-11445.en.md @@ -0,0 +1,2 @@ +Removed os_mon application monitor support on Windows platforms to prevent VM crashes. +Functionality remains on non-Windows platforms.