fix(emqx_opentelemetry): avoid using `application:ensure_all_started/1` for better deadlock safety

This commit is contained in:
Serge Tupchii 2023-11-23 13:35:21 +02:00
parent 8d3f98eff2
commit 2a3f6b749c
9 changed files with 23 additions and 65 deletions

View File

@ -25,12 +25,7 @@
redbug,
xmerl,
{hocon, load},
telemetry,
{opentelemetry, load},
{opentelemetry_api, load},
{opentelemetry_experimental, load},
{opentelemetry_api_experimental, load},
{opentelemetry_exporter, load}
telemetry
],
%% must always be of type `load'
common_business_apps =>

View File

@ -50,6 +50,7 @@ start() ->
start_sysmon(),
configure_shard_transports(),
set_mnesia_extra_diagnostic_checks(),
emqx_otel_app:configure_otel_deps(),
ekka:start(),
ok.

View File

@ -69,9 +69,7 @@ stop_apps() ->
?SLOG(notice, #{msg => "stopping_emqx_apps"}),
_ = emqx_alarm_handler:unload(),
ok = emqx_conf_app:unset_config_loaded(),
lists:foreach(fun stop_one_app/1, lists:reverse(sorted_reboot_apps())),
%% Mute otel deps application.
ok = emqx_otel_app:stop_deps().
lists:foreach(fun stop_one_app/1, lists:reverse(sorted_reboot_apps())).
%% Those port apps are terminated after the main apps
%% Don't need to stop when reboot.

View File

@ -3,13 +3,13 @@
{deps,
[{emqx, {path, "../emqx"}}
%% trace
, {opentelemetry_api, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.4-emqx"}, "apps/opentelemetry_api"}}
, {opentelemetry, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.4-emqx"}, "apps/opentelemetry"}}
%% log metrics
, {opentelemetry_experimental, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.4-emqx"}, "apps/opentelemetry_experimental"}}
, {opentelemetry_api_experimental, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.4-emqx"}, "apps/opentelemetry_api_experimental"}}
, {opentelemetry_api, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.5-emqx"}, "apps/opentelemetry_api"}}
, {opentelemetry, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.5-emqx"}, "apps/opentelemetry"}}
%% logs, metrics
, {opentelemetry_experimental, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.5-emqx"}, "apps/opentelemetry_experimental"}}
, {opentelemetry_api_experimental, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.5-emqx"}, "apps/opentelemetry_api_experimental"}}
%% export
, {opentelemetry_exporter, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.4-emqx"}, "apps/opentelemetry_exporter"}}
, {opentelemetry_exporter, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.5-emqx"}, "apps/opentelemetry_exporter"}}
]}.
{edoc_opts, [{preprocess, true}]}.

View File

@ -8,14 +8,12 @@
stdlib,
emqx,
%% otel metrics depend on emqx_mgmt_cache
emqx_management
]},
{included_applications, [
emqx_management,
opentelemetry_exporter,
opentelemetry,
opentelemetry_api,
opentelemetry_api_experimental,
opentelemetry_experimental,
opentelemetry_exporter
opentelemetry_api,
opentelemetry_api_experimental
]},
{env, []},
{modules, []},

View File

@ -19,7 +19,7 @@
-behaviour(application).
-export([start/2, stop/1]).
-export([stop_deps/0]).
-export([configure_otel_deps/0]).
start(_StartType, _StartArgs) ->
emqx_otel_config:add_handler(),
@ -33,5 +33,12 @@ stop(_State) ->
_ = emqx_otel_config:remove_otel_log_handler(),
ok.
stop_deps() ->
emqx_otel_config:stop_all_otel_apps().
configure_otel_deps() ->
%% default tracer and metrics are started only on demand
ok = application:set_env(
[
{opentelemetry, [{start_default_tracer, false}]},
{opentelemetry_experimental, [{start_default_metrics, false}]}
],
[{persistent, true}]
).

View File

@ -27,7 +27,6 @@
-export([post_config_update/5]).
-export([update/1]).
-export([add_otel_log_handler/0, remove_otel_log_handler/0]).
-export([stop_all_otel_apps/0]).
-export([otel_exporter/1]).
update(Config) ->
@ -59,7 +58,6 @@ post_config_update(?OPTL, _Req, New, Old, AppEnvs) ->
MetricsRes = ensure_otel_metrics(New, Old),
LogsRes = ensure_otel_logs(New, Old),
TracesRes = ensure_otel_traces(New, Old),
_ = maybe_stop_all_otel_apps(New),
case {MetricsRes, LogsRes, TracesRes} of
{ok, ok, ok} -> ok;
Other -> {error, Other}
@ -67,9 +65,6 @@ post_config_update(?OPTL, _Req, New, Old, AppEnvs) ->
post_config_update(_ConfPath, _Req, _NewConf, _OldConf, _AppEnvs) ->
ok.
stop_all_otel_apps() ->
stop_all_otel_apps(true).
add_otel_log_handler() ->
ensure_otel_logs(emqx:get_config(?OPTL), #{}).
@ -104,7 +99,6 @@ ensure_otel_logs(#{logs := LogsConf}, #{logs := LogsConf}) ->
ok;
ensure_otel_logs(#{logs := #{enable := true} = LogsConf}, _OldConf) ->
ok = remove_handler_if_present(?OTEL_LOG_HANDLER_ID),
ok = ensure_log_apps(),
HandlerConf = tr_handler_conf(LogsConf),
%% NOTE: should primary logger level be updated if it's higher than otel log level?
logger:add_handler(?OTEL_LOG_HANDLER_ID, ?OTEL_LOG_HANDLER, HandlerConf);
@ -126,21 +120,6 @@ remove_handler_if_present(HandlerId) ->
ok
end.
ensure_log_apps() ->
{ok, _} = application:ensure_all_started(opentelemetry_exporter),
{ok, _} = application:ensure_all_started(opentelemetry_experimental),
ok.
maybe_stop_all_otel_apps(#{
metrics := #{enable := false},
logs := #{enable := false},
traces := #{enable := false}
}) ->
IsShutdown = false,
stop_all_otel_apps(IsShutdown);
maybe_stop_all_otel_apps(_) ->
ok.
tr_handler_conf(Conf) ->
#{
level := Level,
@ -171,18 +150,3 @@ is_ssl(<<"https://", _/binary>> = _Endpoint) ->
true;
is_ssl(_Endpoint) ->
false.
stop_all_otel_apps(IsShutdown) ->
%% if traces were enabled, it's not safe to stop opentelemetry app,
%% as there could be not finsihed traces that would crash if spans ETS tables are deleted
_ =
case IsShutdown of
true ->
_ = application:stop(opentelemetry);
false ->
ok
end,
_ = application:stop(opentelemetry_experimental),
_ = application:stop(opentelemetry_experimental_api),
_ = application:stop(opentelemetry_exporter),
ok.

View File

@ -74,10 +74,6 @@ setup(_Conf) ->
ensure_apps(Conf) ->
#{exporter := #{interval := ExporterInterval} = Exporter} = Conf,
{ok, _} = application:ensure_all_started(opentelemetry_exporter),
{ok, _} = application:ensure_all_started(opentelemetry),
{ok, _} = application:ensure_all_started(opentelemetry_experimental),
{ok, _} = application:ensure_all_started(opentelemetry_api_experimental),
_ = opentelemetry_experimental:stop_default_metrics(),
ok = application:set_env(

View File

@ -78,7 +78,6 @@ start(Conf) ->
],
set_trace_all(TraceAll),
ok = application:set_env([{opentelemetry, OtelEnv}]),
_ = application:ensure_all_started(opentelemetry),
Res = assert_started(opentelemetry:start_default_tracer_provider()),
case Res of
ok ->