Merge pull request #12744 from JimMoen/EMQX-12046/fix-otel-cpu-sup
fix(otel): cpu usage/idle metrics for opentelemetry
This commit is contained in:
commit
5390203184
|
@ -124,9 +124,12 @@ ensure_otel_metrics(
|
||||||
) ->
|
) ->
|
||||||
ok;
|
ok;
|
||||||
ensure_otel_metrics(#{metrics := #{enable := true}} = Conf, _Old) ->
|
ensure_otel_metrics(#{metrics := #{enable := true}} = Conf, _Old) ->
|
||||||
|
ok = emqx_otel_cpu_sup:stop_otel_cpu_sup(),
|
||||||
|
_ = emqx_otel_cpu_sup:start_otel_cpu_sup(Conf),
|
||||||
_ = emqx_otel_metrics:stop_otel(),
|
_ = emqx_otel_metrics:stop_otel(),
|
||||||
emqx_otel_metrics:start_otel(Conf);
|
emqx_otel_metrics:start_otel(Conf);
|
||||||
ensure_otel_metrics(#{metrics := #{enable := false}}, _Old) ->
|
ensure_otel_metrics(#{metrics := #{enable := false}}, _Old) ->
|
||||||
|
ok = emqx_otel_cpu_sup:stop_otel_cpu_sup(),
|
||||||
emqx_otel_metrics:stop_otel();
|
emqx_otel_metrics:stop_otel();
|
||||||
ensure_otel_metrics(_, _) ->
|
ensure_otel_metrics(_, _) ->
|
||||||
ok.
|
ok.
|
||||||
|
|
|
@ -0,0 +1,146 @@
|
||||||
|
%%--------------------------------------------------------------------
|
||||||
|
%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
|
||||||
|
%%
|
||||||
|
%% Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
%% you may not use this file except in compliance with the License.
|
||||||
|
%% You may obtain a copy of the License at
|
||||||
|
%%
|
||||||
|
%% http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
%%
|
||||||
|
%% Unless required by applicable law or agreed to in writing, software
|
||||||
|
%% distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
%% See the License for the specific language governing permissions and
|
||||||
|
%% limitations under the License.
|
||||||
|
%%--------------------------------------------------------------------
|
||||||
|
|
||||||
|
-module(emqx_otel_cpu_sup).
|
||||||
|
|
||||||
|
-behaviour(gen_server).
|
||||||
|
|
||||||
|
-include_lib("emqx/include/logger.hrl").
|
||||||
|
|
||||||
|
%% gen_server APIs
|
||||||
|
-export([start_link/1]).
|
||||||
|
|
||||||
|
-export([
|
||||||
|
start_otel_cpu_sup/1,
|
||||||
|
stop_otel_cpu_sup/0,
|
||||||
|
stats/1
|
||||||
|
]).
|
||||||
|
|
||||||
|
%% gen_server callbacks
|
||||||
|
-export([
|
||||||
|
init/1,
|
||||||
|
handle_continue/2,
|
||||||
|
handle_call/3,
|
||||||
|
handle_cast/2,
|
||||||
|
handle_info/2,
|
||||||
|
terminate/2,
|
||||||
|
code_change/3
|
||||||
|
]).
|
||||||
|
|
||||||
|
-define(REFRESH, refresh).
|
||||||
|
-define(OTEL_CPU_USAGE_WORKER, ?MODULE).
|
||||||
|
-define(SUPERVISOR, emqx_otel_sup).
|
||||||
|
|
||||||
|
%%--------------------------------------------------------------------
|
||||||
|
%% API
|
||||||
|
%%--------------------------------------------------------------------
|
||||||
|
|
||||||
|
%% Register this module as a worker child of the OpenTelemetry supervisor.
%% The supervisor's answer is normalized by assert_started/1: `ok' when the
%% worker is (or already was) started, `{error, Reason}' otherwise.
start_otel_cpu_sup(Conf) ->
    ChildSpec = emqx_otel_sup:worker_spec(?MODULE, Conf),
    StartResult = supervisor:start_child(?SUPERVISOR, ChildSpec),
    assert_started(StartResult).
|
||||||
|
|
||||||
|
%% Terminate the CPU usage worker and remove its child spec from the
%% OpenTelemetry supervisor. Returns `ok' when the supervisor is not
%% registered or the child is unknown to it.
stop_otel_cpu_sup() ->
    do_stop_otel_cpu_sup(erlang:whereis(?SUPERVISOR)).

%% Nothing to stop when the supervisor itself is not registered.
do_stop_otel_cpu_sup(undefined) ->
    ok;
do_stop_otel_cpu_sup(SupPid) ->
    case supervisor:terminate_child(SupPid, ?MODULE) of
        ok ->
            %% The spec is only deleted after a successful termination.
            supervisor:delete_child(SupPid, ?MODULE);
        {error, not_found} ->
            ok;
        Other ->
            Other
    end.
|
||||||
|
|
||||||
|
%% Ask the CPU usage worker for the value stored under Name.
%% The call has no timeout (`infinity').
stats(Name) ->
    Request = {stats, Name},
    gen_server:call(?OTEL_CPU_USAGE_WORKER, Request, infinity).
|
||||||
|
|
||||||
|
%%--------------------------------------------------------------------
|
||||||
|
%% gen_server callbacks
|
||||||
|
%% All calls to cpu_sup:util/0,1 are made from this single process
|
||||||
|
%%--------------------------------------------------------------------
|
||||||
|
|
||||||
|
%% Start the worker linked to the caller and registered locally under
%% ?OTEL_CPU_USAGE_WORKER. Conf is handed to init/1 unchanged.
start_link(Conf) ->
    ServerName = {local, ?OTEL_CPU_USAGE_WORKER},
    gen_server:start_link(ServerName, ?MODULE, Conf, []).
|
||||||
|
|
||||||
|
%% Start with an empty state map and defer the real setup to
%% handle_continue/2 so that init/1 returns immediately.
init(Conf) ->
    EmptyState = #{},
    {ok, EmptyState, {continue, {setup, Conf}}}.
|
||||||
|
|
||||||
|
%% Interval in milliseconds
|
||||||
|
%% Deferred setup: make sure os_mon is running, prime cpu_sup and arm the
%% first refresh timer. Only matches a config with metrics enabled and an
%% `interval' present.
handle_continue({setup, #{metrics := #{enable := true, interval := IntervalMs}}}, State) ->
    %% start os_mon temporarily
    {ok, _} = application:ensure_all_started(os_mon),
    %% Per the cpu_sup documentation, the first value returned to a process
    %% by cpu_sup:util/0,1 (and the first one after a cpu_sup restart) is
    %% usually the utilization since boot and must be treated as garbage,
    %% so one sample is taken and thrown away here.
    _Discarded = cpu_sup:util(),
    TimerRef = start_refresh_timer(IntervalMs),
    Additions = #{interval => IntervalMs, refresh_time_ref => TimerRef},
    {noreply, maps:merge(State, Additions)}.
|
||||||
|
|
||||||
|
%% Synchronous requests:
%%   {stats, Name} - reply with get_stats(Name, State)
%%   stop          - cancel the refresh timer, reply `ok' and stop normally
%%   anything else - logged as an error and answered with `ignored'
handle_call({stats, Name}, _From, State) ->
    {reply, get_stats(Name, State), State};
handle_call(stop, _From, State) ->
    cancel_outdated_timer(State),
    %% Reply explicitly: a bare {stop, normal, State} sends no reply, so the
    %% caller's gen_server:call/2,3 would exit instead of returning.
    {stop, normal, ok, State};
handle_call(Req, _From, State) ->
    ?SLOG(error, #{msg => "unexpected_call", call => Req}),
    {reply, ignored, State}.
|
||||||
|
|
||||||
|
%% No casts are part of this worker's protocol: log whatever arrives and
%% carry on with the state untouched.
handle_cast(UnexpectedCast, State) ->
    ?SLOG(error, #{msg => "unexpected_cast", cast => UnexpectedCast}),
    {noreply, State}.
|
||||||
|
|
||||||
|
%% Refresh tick: take a new CPU sample and re-arm the timer (see refresh/1).
handle_info({timeout, _Timer, ?REFRESH}, State) ->
    {noreply, refresh(State)};
%% Any other message is logged and dropped. Without this clause a stray
%% message would crash the worker with function_clause.
handle_info(Info, State) ->
    ?SLOG(error, #{msg => "unexpected_info", info => Info}),
    {noreply, State}.
|
||||||
|
|
||||||
|
%% No cleanup is performed on shutdown; always returns `ok'.
terminate(_Why, _FinalState) ->
    ok.
|
||||||
|
|
||||||
|
%% Hot code upgrade hook: the state is carried over unchanged.
code_change(_FromVsn, CurrentState, _Extra) ->
    {ok, CurrentState}.
|
||||||
|
|
||||||
|
%%--------------------------------------------------------------------
|
||||||
|
%% Internal functions
|
||||||
|
%%--------------------------------------------------------------------
|
||||||
|
|
||||||
|
%% Take one CPU sample, store it under 'cpu.use' / 'cpu.idle' and arm the
%% next refresh timer. When cpu_sup:util/1 does not return an `all' tuple,
%% both values are stored as 0.
refresh(#{interval := Interval} = State) ->
    {Use, Idle} =
        case cpu_sup:util([]) of
            {all, Busy, NonBusy, _Misc} -> {Busy, NonBusy};
            _ -> {0, 0}
        end,
    TimerRef = start_refresh_timer(Interval),
    State#{'cpu.use' => Use, 'cpu.idle' => Idle, refresh_time_ref => TimerRef}.
|
||||||
|
|
||||||
|
%% Look up Name in the worker state; 0 is returned for a key that has not
%% been stored yet.
get_stats(Name, State) ->
    case maps:find(Name, State) of
        {ok, Value} -> Value;
        error -> 0
    end.
|
||||||
|
|
||||||
|
%% Cancel the refresh timer whose reference is kept in the state.
%% The result of the cancellation is discarded; always returns `ok'.
cancel_outdated_timer(#{refresh_time_ref := TimerRef}) ->
    _ = emqx_utils:cancel_timer(TimerRef),
    ok.
|
||||||
|
|
||||||
|
%% Arm a timer carrying the ?REFRESH message; returns whatever
%% start_timer/2 returns.
start_refresh_timer(IntervalMs) ->
    RefreshMsg = ?REFRESH,
    start_timer(IntervalMs, RefreshMsg).
|
||||||
|
|
||||||
|
%% Thin wrapper that delegates to emqx_utils:start_timer/2.
start_timer(IntervalMs, Payload) ->
    emqx_utils:start_timer(IntervalMs, Payload).
|
||||||
|
|
||||||
|
%% Normalize the result of supervisor:start_child/2: an already running
%% child counts as success, any other error is returned unchanged.
assert_started({error, {already_started, _Pid}}) -> ok;
assert_started({error, _Reason} = Error) -> Error;
assert_started({ok, _Pid}) -> ok;
assert_started({ok, _Pid, _Info}) -> ok.
|
|
@ -197,6 +197,10 @@ bytes_metrics() ->
|
||||||
get_stats_gauge(Name) ->
|
get_stats_gauge(Name) ->
|
||||||
[{emqx_stats:getstat(Name), #{}}].
|
[{emqx_stats:getstat(Name), #{}}].
|
||||||
|
|
||||||
|
get_vm_gauge('cpu.use') ->
|
||||||
|
[{emqx_otel_cpu_sup:stats('cpu.use'), #{}}];
|
||||||
|
get_vm_gauge('cpu.idle') ->
|
||||||
|
[{emqx_otel_cpu_sup:stats('cpu.idle'), #{}}];
|
||||||
get_vm_gauge(Name) ->
|
get_vm_gauge(Name) ->
|
||||||
[{emqx_mgmt:vm_stats(Name), #{}}].
|
[{emqx_mgmt:vm_stats(Name), #{}}].
|
||||||
|
|
||||||
|
@ -254,8 +258,6 @@ create_counter(Meter, Counters, CallBack) ->
|
||||||
Counters
|
Counters
|
||||||
).
|
).
|
||||||
|
|
||||||
%% Note: list_to_existing_atom("cpu.use") will crash
|
|
||||||
%% so we make sure the atom is already existing here
|
|
||||||
normalize_name(cpu_use) ->
|
normalize_name(cpu_use) ->
|
||||||
'cpu.use';
|
'cpu.use';
|
||||||
normalize_name(cpu_idle) ->
|
normalize_name(cpu_idle) ->
|
||||||
|
|
|
@ -42,7 +42,12 @@ init([]) ->
|
||||||
},
|
},
|
||||||
Children =
|
Children =
|
||||||
case emqx_conf:get([opentelemetry]) of
|
case emqx_conf:get([opentelemetry]) of
|
||||||
#{metrics := #{enable := false}} -> [];
|
#{metrics := #{enable := false}} ->
|
||||||
#{metrics := #{enable := true}} = Conf -> [worker_spec(emqx_otel_metrics, Conf)]
|
[];
|
||||||
|
#{metrics := #{enable := true}} = Conf ->
|
||||||
|
[
|
||||||
|
worker_spec(emqx_otel_metrics, Conf),
|
||||||
|
worker_spec(emqx_otel_cpu_sup, Conf)
|
||||||
|
]
|
||||||
end,
|
end,
|
||||||
{ok, {SupFlags, Children}}.
|
{ok, {SupFlags, Children}}.
|
||||||
|
|
Loading…
Reference in New Issue