fix: /api/nodes times out when emqx is under high load

This commit is contained in:
Zhongwen Deng 2023-01-29 10:25:28 +08:00
parent 7e8253e3af
commit 2d67bb3fb6
6 changed files with 109 additions and 11 deletions

View File

@ -130,8 +130,10 @@ handle_info({timeout, _Timer, mem_check}, #{sysmem_high_watermark := HWM} = Stat
handle_info({timeout, _Timer, cpu_check}, State) -> handle_info({timeout, _Timer, cpu_check}, State) ->
CPUHighWatermark = emqx:get_config([sysmon, os, cpu_high_watermark]) * 100, CPUHighWatermark = emqx:get_config([sysmon, os, cpu_high_watermark]) * 100,
CPULowWatermark = emqx:get_config([sysmon, os, cpu_low_watermark]) * 100, CPULowWatermark = emqx:get_config([sysmon, os, cpu_low_watermark]) * 100,
case emqx_vm:cpu_util() of CPUVal = emqx_vm:cpu_util(),
0 -> case CPUVal of
%% 0 or 0.0
Busy when Busy == 0 ->
ok; ok;
Busy when Busy > CPUHighWatermark -> Busy when Busy > CPUHighWatermark ->
_ = emqx_alarm:activate( _ = emqx_alarm:activate(
@ -236,5 +238,5 @@ do_update_mem_alarm_status(HWM0) ->
ok. ok.
usage_msg(Usage, What) -> usage_msg(Usage, What) ->
%% devide by 1.0 to ensure float point number %% divide by 1.0 to ensure float point number
iolist_to_binary(io_lib:format("~.2f% ~p usage", [Usage / 1.0, What])). iolist_to_binary(io_lib:format("~.2f% ~p usage", [Usage / 1.0, What])).

View File

@ -232,8 +232,10 @@ mem_info() ->
Free = proplists:get_value(free_memory, Dataset), Free = proplists:get_value(free_memory, Dataset),
[{total_memory, Total}, {used_memory, Total - Free}]. [{total_memory, Total}, {used_memory, Total - Free}].
ftos(F) -> ftos(F) when is_float(F) ->
io_lib:format("~.2f", [F / 1.0]). float_to_binary(F, [{decimals, 2}]);
ftos(F) when is_integer(F) ->
ftos(F / 1.0).
%%%% erlang vm scheduler_usage fun copied from recon %%%% erlang vm scheduler_usage fun copied from recon
scheduler_usage(Interval) when is_integer(Interval) -> scheduler_usage(Interval) when is_integer(Interval) ->
@ -391,11 +393,12 @@ cpu_util() ->
compat_windows(Fun) -> compat_windows(Fun) ->
case os:type() of case os:type() of
{win32, nt} -> {win32, nt} ->
0; 0.0;
_Type -> _Type ->
case catch Fun() of case catch Fun() of
Val when is_float(Val) -> floor(Val * 100) / 100;
Val when is_number(Val) -> Val; Val when is_number(Val) -> Val;
_Error -> 0 _Error -> 0.0
end end
end. end.

View File

@ -63,7 +63,7 @@ handle_info({timeout, _Timer, check}, State) ->
ProcessCount = erlang:system_info(process_count), ProcessCount = erlang:system_info(process_count),
case ProcessCount / erlang:system_info(process_limit) of case ProcessCount / erlang:system_info(process_limit) of
Percent when Percent > ProcHighWatermark -> Percent when Percent > ProcHighWatermark ->
Usage = io_lib:format("~p%", [Percent * 100]), Usage = usage(Percent),
Message = [Usage, " process usage"], Message = [Usage, " process usage"],
emqx_alarm:activate( emqx_alarm:activate(
too_many_processes, too_many_processes,
@ -75,7 +75,7 @@ handle_info({timeout, _Timer, check}, State) ->
Message Message
); );
Percent when Percent < ProcLowWatermark -> Percent when Percent < ProcLowWatermark ->
Usage = io_lib:format("~p%", [Percent * 100]), Usage = usage(Percent),
Message = [Usage, " process usage"], Message = [Usage, " process usage"],
emqx_alarm:ensure_deactivated( emqx_alarm:ensure_deactivated(
too_many_processes, too_many_processes,
@ -108,3 +108,6 @@ code_change(_OldVsn, State, _Extra) ->
start_check_timer() -> start_check_timer() ->
Interval = emqx:get_config([sysmon, vm, process_check_interval]), Interval = emqx:get_config([sysmon, vm, process_check_interval]),
emqx_misc:start_timer(Interval, check). emqx_misc:start_timer(Interval, check).
%% Renders a ratio as a whole-number percentage string, e.g. 0.5 -> "50%".
%% The scaled value is rounded down with floor/1.
usage(Percent) ->
    WholePercent = floor(Percent * 100),
    Digits = integer_to_list(WholePercent),
    Digits ++ "%".

View File

@ -150,7 +150,7 @@ node_info() ->
get_sys_memory() -> get_sys_memory() ->
case os:type() of case os:type() of
{unix, linux} -> {unix, linux} ->
load_ctl:get_sys_memory(); emqx_mgmt_sys_memory:get_sys_memory();
_ -> _ ->
{0, 0} {0, 0}
end. end.

View File

@ -26,4 +26,15 @@ start_link() ->
supervisor:start_link({local, ?MODULE}, ?MODULE, []). supervisor:start_link({local, ?MODULE}, ?MODULE, []).
init([]) -> init([]) ->
{ok, {{one_for_one, 1, 5}, []}}. LC = child_spec(emqx_mgmt_sys_memory, 5000, worker),
{ok, {{one_for_one, 1, 5}, [LC]}}.
%% Builds a supervisor child specification map for the given module.
%% The child is started through Module:start_link/0, is always restarted
%% (permanent), and uses the caller-supplied shutdown value and child type.
child_spec(Module, ShutdownTimeout, ChildType) ->
    StartMFA = {Module, start_link, []},
    #{
        id => Module,
        start => StartMFA,
        restart => permanent,
        shutdown => ShutdownTimeout,
        type => ChildType,
        modules => [Module]
    }.

View File

@ -0,0 +1,79 @@
%%--------------------------------------------------------------------
%% Copyright (c) 2020-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%--------------------------------------------------------------------
-module(emqx_mgmt_sys_memory).
-behaviour(gen_server).
-define(SYS_MEMORY_CACHE_KEY, ?MODULE).
-define(TIMEOUT, 3000).
-export([start_link/0, get_sys_memory/0, get_sys_memory/1]).
-export([
init/1,
handle_call/3,
handle_cast/2,
handle_info/2,
terminate/2,
code_change/3
]).
%% Fetches the system memory reading using the default call timeout
%% (?TIMEOUT milliseconds). See get_sys_memory/1 for the timeout behaviour.
get_sys_memory() ->
    DefaultTimeout = ?TIMEOUT,
    get_sys_memory(DefaultTimeout).
%% Asks the registered server for the system memory reading, waiting at most
%% Timeout milliseconds. If the call times out, the last value stored in the
%% cache is returned instead of crashing the caller. Any other exit
%% (for example the server not running) is deliberately not caught.
get_sys_memory(Timeout) ->
    try gen_server:call(?MODULE, get_sys_memory, Timeout) of
        Memory ->
            Memory
    catch
        exit:{timeout, _CallInfo} ->
            get_memory_from_cache()
    end.
%% Starts the server linked to the caller and registers it locally under
%% the module name.
start_link() ->
    ServerName = {local, ?MODULE},
    gen_server:start_link(ServerName, ?MODULE, [], []).
%% gen_server init callback.
%% The state only records when the memory sample was last refreshed.
%% `undefined` means "never sampled"; monotonic time may be negative, so 0
%% cannot be used as the "never" marker.
init([]) ->
    {ok, #{last_time => undefined}}.

%% gen_server call callback.
%%
%% `get_sys_memory`: replies with the system memory reading. A fresh reading
%% is taken from load_ctl:get_sys_memory/0 at most once per ?TIMEOUT
%% milliseconds and stored in the persistent_term cache; requests arriving
%% within that window are answered from the cache.
%%
%% Any other request is answered with `ok` and leaves the state untouched.
handle_call(get_sys_memory, _From, State = #{last_time := LastTime}) ->
    %% Elapsed time is measured with the monotonic clock: wall-clock time
    %% (erlang:system_time/1) can jump backwards, which would keep a stale
    %% sample looking "fresh" until the clock caught up again.
    Now = erlang:monotonic_time(millisecond),
    Expired = LastTime =:= undefined orelse Now - LastTime >= ?TIMEOUT,
    case Expired of
        true ->
            Memory = load_ctl:get_sys_memory(),
            %% The cache is also read by callers whose gen_server:call timed
            %% out, so it must live outside the server state.
            persistent_term:put(?SYS_MEMORY_CACHE_KEY, Memory),
            {reply, Memory, State#{last_time => Now}};
        false ->
            {reply, get_memory_from_cache(), State}
    end;
handle_call(_Request, _From, State = #{}) ->
    {reply, ok, State}.
%% gen_server cast callback: no casts are part of this server's protocol,
%% so every cast is ignored and the (map) state is kept as is.
handle_cast(_Msg, #{} = ServerState) ->
    {noreply, ServerState}.
%% gen_server info callback: any out-of-band message is dropped and the
%% (map) state is kept as is.
handle_info(_Message, #{} = ServerState) ->
    {noreply, ServerState}.
%% gen_server terminate callback: nothing is cleaned up here; always
%% returns ok.
terminate(_Why, #{} = _ServerState) ->
    ok.
%% gen_server code_change callback: the state needs no conversion, so it is
%% returned unchanged.
code_change(_FromVsn, #{} = ServerState, _Extra) ->
    {ok, ServerState}.
%%%===================================================================
%%% Internal functions
%%%===================================================================
%% Reads the last memory reading stored under ?SYS_MEMORY_CACHE_KEY in
%% persistent_term. Returns {0, 0} when nothing has been stored yet.
get_memory_from_cache() ->
    Default = {0, 0},
    persistent_term:get(?SYS_MEMORY_CACHE_KEY, Default).