fix(emqx_opentelemetry): avoid using `application:ensure_all_started/1` for better deadlock safety

This commit is contained in:
Serge Tupchii 2023-11-23 13:35:21 +02:00
parent 8d3f98eff2
commit 2a3f6b749c
9 changed files with 23 additions and 65 deletions

View File

@ -25,12 +25,7 @@
redbug, redbug,
xmerl, xmerl,
{hocon, load}, {hocon, load},
telemetry, telemetry
{opentelemetry, load},
{opentelemetry_api, load},
{opentelemetry_experimental, load},
{opentelemetry_api_experimental, load},
{opentelemetry_exporter, load}
], ],
%% must always be of type `load' %% must always be of type `load'
common_business_apps => common_business_apps =>

View File

@ -50,6 +50,7 @@ start() ->
start_sysmon(), start_sysmon(),
configure_shard_transports(), configure_shard_transports(),
set_mnesia_extra_diagnostic_checks(), set_mnesia_extra_diagnostic_checks(),
emqx_otel_app:configure_otel_deps(),
ekka:start(), ekka:start(),
ok. ok.

View File

@ -69,9 +69,7 @@ stop_apps() ->
?SLOG(notice, #{msg => "stopping_emqx_apps"}), ?SLOG(notice, #{msg => "stopping_emqx_apps"}),
_ = emqx_alarm_handler:unload(), _ = emqx_alarm_handler:unload(),
ok = emqx_conf_app:unset_config_loaded(), ok = emqx_conf_app:unset_config_loaded(),
lists:foreach(fun stop_one_app/1, lists:reverse(sorted_reboot_apps())), lists:foreach(fun stop_one_app/1, lists:reverse(sorted_reboot_apps())).
%% Mute otel deps application.
ok = emqx_otel_app:stop_deps().
%% Those port apps are terminated after the main apps %% Those port apps are terminated after the main apps
%% Don't need to stop when reboot. %% Don't need to stop when reboot.

View File

@ -3,13 +3,13 @@
{deps, {deps,
[{emqx, {path, "../emqx"}} [{emqx, {path, "../emqx"}}
%% trace %% trace
, {opentelemetry_api, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.4-emqx"}, "apps/opentelemetry_api"}} , {opentelemetry_api, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.5-emqx"}, "apps/opentelemetry_api"}}
, {opentelemetry, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.4-emqx"}, "apps/opentelemetry"}} , {opentelemetry, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.5-emqx"}, "apps/opentelemetry"}}
%% log metrics %% logs, metrics
, {opentelemetry_experimental, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.4-emqx"}, "apps/opentelemetry_experimental"}} , {opentelemetry_experimental, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.5-emqx"}, "apps/opentelemetry_experimental"}}
, {opentelemetry_api_experimental, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.4-emqx"}, "apps/opentelemetry_api_experimental"}} , {opentelemetry_api_experimental, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.5-emqx"}, "apps/opentelemetry_api_experimental"}}
%% export %% export
, {opentelemetry_exporter, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.4-emqx"}, "apps/opentelemetry_exporter"}} , {opentelemetry_exporter, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.5-emqx"}, "apps/opentelemetry_exporter"}}
]}. ]}.
{edoc_opts, [{preprocess, true}]}. {edoc_opts, [{preprocess, true}]}.

View File

@ -8,14 +8,12 @@
stdlib, stdlib,
emqx, emqx,
%% otel metrics depend on emqx_mgmt_cache %% otel metrics depend on emqx_mgmt_cache
emqx_management emqx_management,
]}, opentelemetry_exporter,
{included_applications, [
opentelemetry, opentelemetry,
opentelemetry_api,
opentelemetry_api_experimental,
opentelemetry_experimental, opentelemetry_experimental,
opentelemetry_exporter opentelemetry_api,
opentelemetry_api_experimental
]}, ]},
{env, []}, {env, []},
{modules, []}, {modules, []},

View File

@ -19,7 +19,7 @@
-behaviour(application). -behaviour(application).
-export([start/2, stop/1]). -export([start/2, stop/1]).
-export([stop_deps/0]). -export([configure_otel_deps/0]).
start(_StartType, _StartArgs) -> start(_StartType, _StartArgs) ->
emqx_otel_config:add_handler(), emqx_otel_config:add_handler(),
@ -33,5 +33,12 @@ stop(_State) ->
_ = emqx_otel_config:remove_otel_log_handler(), _ = emqx_otel_config:remove_otel_log_handler(),
ok. ok.
stop_deps() -> configure_otel_deps() ->
emqx_otel_config:stop_all_otel_apps(). %% default tracer and metrics are started only on demand
ok = application:set_env(
[
{opentelemetry, [{start_default_tracer, false}]},
{opentelemetry_experimental, [{start_default_metrics, false}]}
],
[{persistent, true}]
).

View File

@ -27,7 +27,6 @@
-export([post_config_update/5]). -export([post_config_update/5]).
-export([update/1]). -export([update/1]).
-export([add_otel_log_handler/0, remove_otel_log_handler/0]). -export([add_otel_log_handler/0, remove_otel_log_handler/0]).
-export([stop_all_otel_apps/0]).
-export([otel_exporter/1]). -export([otel_exporter/1]).
update(Config) -> update(Config) ->
@ -59,7 +58,6 @@ post_config_update(?OPTL, _Req, New, Old, AppEnvs) ->
MetricsRes = ensure_otel_metrics(New, Old), MetricsRes = ensure_otel_metrics(New, Old),
LogsRes = ensure_otel_logs(New, Old), LogsRes = ensure_otel_logs(New, Old),
TracesRes = ensure_otel_traces(New, Old), TracesRes = ensure_otel_traces(New, Old),
_ = maybe_stop_all_otel_apps(New),
case {MetricsRes, LogsRes, TracesRes} of case {MetricsRes, LogsRes, TracesRes} of
{ok, ok, ok} -> ok; {ok, ok, ok} -> ok;
Other -> {error, Other} Other -> {error, Other}
@ -67,9 +65,6 @@ post_config_update(?OPTL, _Req, New, Old, AppEnvs) ->
post_config_update(_ConfPath, _Req, _NewConf, _OldConf, _AppEnvs) -> post_config_update(_ConfPath, _Req, _NewConf, _OldConf, _AppEnvs) ->
ok. ok.
%% Stops the OTel applications by delegating to `stop_all_otel_apps/1'
%% with `IsShutdown = true', i.e. the variant that also stops the
%% `opentelemetry' application itself.
stop_all_otel_apps() ->
stop_all_otel_apps(true).
add_otel_log_handler() -> add_otel_log_handler() ->
ensure_otel_logs(emqx:get_config(?OPTL), #{}). ensure_otel_logs(emqx:get_config(?OPTL), #{}).
@ -104,7 +99,6 @@ ensure_otel_logs(#{logs := LogsConf}, #{logs := LogsConf}) ->
ok; ok;
ensure_otel_logs(#{logs := #{enable := true} = LogsConf}, _OldConf) -> ensure_otel_logs(#{logs := #{enable := true} = LogsConf}, _OldConf) ->
ok = remove_handler_if_present(?OTEL_LOG_HANDLER_ID), ok = remove_handler_if_present(?OTEL_LOG_HANDLER_ID),
ok = ensure_log_apps(),
HandlerConf = tr_handler_conf(LogsConf), HandlerConf = tr_handler_conf(LogsConf),
%% NOTE: should primary logger level be updated if it's higher than otel log level? %% NOTE: should primary logger level be updated if it's higher than otel log level?
logger:add_handler(?OTEL_LOG_HANDLER_ID, ?OTEL_LOG_HANDLER, HandlerConf); logger:add_handler(?OTEL_LOG_HANDLER_ID, ?OTEL_LOG_HANDLER, HandlerConf);
@ -126,21 +120,6 @@ remove_handler_if_present(HandlerId) ->
ok ok
end. end.
%% Makes sure the applications needed before the OTel log handler is added
%% are running (each together with its dependencies), in this order:
%% `opentelemetry_exporter', then `opentelemetry_experimental'.
%% Returns `ok'; crashes with `badmatch' if either application fails to start.
ensure_log_apps() ->
    lists:foreach(
        fun(App) -> {ok, _} = application:ensure_all_started(App) end,
        [opentelemetry_exporter, opentelemetry_experimental]
    ).
%% Stops the OTel applications only when metrics, logs and traces are all
%% disabled in the given config map; any other config is left untouched
%% and `ok' is returned.
maybe_stop_all_otel_apps(
    #{metrics := #{enable := false}, logs := #{enable := false}, traces := #{enable := false}}
) ->
    %% `false' = not a shutdown, so `stop_all_otel_apps/1' keeps the
    %% `opentelemetry' app itself running.
    stop_all_otel_apps(false);
maybe_stop_all_otel_apps(_Conf) ->
    ok.
tr_handler_conf(Conf) -> tr_handler_conf(Conf) ->
#{ #{
level := Level, level := Level,
@ -171,18 +150,3 @@ is_ssl(<<"https://", _/binary>> = _Endpoint) ->
true; true;
is_ssl(_Endpoint) -> is_ssl(_Endpoint) ->
false. false.
%% Stops the OpenTelemetry applications (best effort).
%%
%% `IsShutdown' selects whether the `opentelemetry' app itself is stopped:
%%   * `true'  - stop it as well;
%%   * `false' - leave it running: if traces were enabled, it is not safe to
%%     stop it, as there could be unfinished traces that would crash once the
%%     spans ETS tables are deleted.
%%
%% The results of `application:stop/1' are deliberately ignored, since an
%% application that is not running only yields `{error, {not_started, App}}'.
%% Always returns `ok'.
stop_all_otel_apps(IsShutdown) ->
    _ =
        case IsShutdown of
            true -> application:stop(opentelemetry);
            false -> ok
        end,
    _ = application:stop(opentelemetry_experimental),
    %% The application is named `opentelemetry_api_experimental'; the previous
    %% `opentelemetry_experimental_api' does not exist, so that stop was a
    %% silently ignored no-op.
    _ = application:stop(opentelemetry_api_experimental),
    _ = application:stop(opentelemetry_exporter),
    ok.

View File

@ -74,10 +74,6 @@ setup(_Conf) ->
ensure_apps(Conf) -> ensure_apps(Conf) ->
#{exporter := #{interval := ExporterInterval} = Exporter} = Conf, #{exporter := #{interval := ExporterInterval} = Exporter} = Conf,
{ok, _} = application:ensure_all_started(opentelemetry_exporter),
{ok, _} = application:ensure_all_started(opentelemetry),
{ok, _} = application:ensure_all_started(opentelemetry_experimental),
{ok, _} = application:ensure_all_started(opentelemetry_api_experimental),
_ = opentelemetry_experimental:stop_default_metrics(), _ = opentelemetry_experimental:stop_default_metrics(),
ok = application:set_env( ok = application:set_env(

View File

@ -78,7 +78,6 @@ start(Conf) ->
], ],
set_trace_all(TraceAll), set_trace_all(TraceAll),
ok = application:set_env([{opentelemetry, OtelEnv}]), ok = application:set_env([{opentelemetry, OtelEnv}]),
_ = application:ensure_all_started(opentelemetry),
Res = assert_started(opentelemetry:start_default_tracer_provider()), Res = assert_started(opentelemetry:start_default_tracer_provider()),
case Res of case Res of
ok -> ok ->