From 4bc80ee48386ed7300ad35a6a03221301fd234a0 Mon Sep 17 00:00:00 2001 From: JimMoen Date: Wed, 15 May 2024 18:21:00 +0800 Subject: [PATCH 01/55] build: unsupported otp23 and otp24 casued `maybe_expr` introduced --- scripts/ensure-rebar3.sh | 11 +---------- scripts/get-otp-vsn.sh | 2 +- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/scripts/ensure-rebar3.sh b/scripts/ensure-rebar3.sh index 054deabd4..14b6ad7c7 100755 --- a/scripts/ensure-rebar3.sh +++ b/scripts/ensure-rebar3.sh @@ -4,17 +4,8 @@ set -euo pipefail [ "${DEBUG:-0}" -eq 1 ] && set -x -## rebar3 tag 3.19.0-emqx-1 is compiled using latest official OTP-24 image. -## we have to use an otp24-compiled rebar3 because the defination of record #application{} -## in systools.hrl is changed in otp24. OTP_VSN="${OTP_VSN:-$(./scripts/get-otp-vsn.sh)}" case ${OTP_VSN} in - 23*) - VERSION="3.16.1-emqx-1" - ;; - 24*) - VERSION="3.18.0-emqx-1" - ;; 25*) VERSION="3.19.0-emqx-9" ;; @@ -22,7 +13,7 @@ case ${OTP_VSN} in VERSION="3.20.0-emqx-1" ;; *) - echo "Unsupporetd Erlang/OTP version $OTP_VSN" + echo "Unsupported Erlang/OTP version $OTP_VSN" exit 1 ;; esac diff --git a/scripts/get-otp-vsn.sh b/scripts/get-otp-vsn.sh index 699a16f3f..d21e2c830 100755 --- a/scripts/get-otp-vsn.sh +++ b/scripts/get-otp-vsn.sh @@ -2,4 +2,4 @@ set -euo pipefail -erl -noshell -eval '{ok, Version} = file:read_file(filename:join([code:root_dir(), "releases", erlang:system_info(otp_release), "OTP_VERSION"])), io:fwrite(Version), halt().' +erl -noshell -eval '{ok, Version} = file:read_file(filename:join([code:root_dir(), "releases", erlang:system_info(otp_release), "OTP_VERSION"])), io:fwrite(Version), halt().' From e8d4e8811825ac03ca930bc73295aa576b31117e Mon Sep 17 00:00:00 2001 From: JimMoen Date: Thu, 16 May 2024 17:31:02 +0800 Subject: [PATCH 02/55] fix(plugin): serde not found on new-joined nodes --- apps/emqx_plugins/src/emqx_plugins_app.erl | 2 +- apps/emqx_plugins/src/emqx_plugins_serde.erl | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/apps/emqx_plugins/src/emqx_plugins_app.erl b/apps/emqx_plugins/src/emqx_plugins_app.erl index a8b2fd6d6..740bcdd22 100644 --- a/apps/emqx_plugins/src/emqx_plugins_app.erl +++ b/apps/emqx_plugins/src/emqx_plugins_app.erl @@ -27,8 +27,8 @@ start(_Type, _Args) -> %% load all pre-configured - ok = emqx_plugins:ensure_started(), {ok, Sup} = emqx_plugins_sup:start_link(), + ok = emqx_plugins:ensure_started(), ok = emqx_config_handler:add_handler([?CONF_ROOT], emqx_plugins), {ok, Sup}. diff --git a/apps/emqx_plugins/src/emqx_plugins_serde.erl b/apps/emqx_plugins/src/emqx_plugins_serde.erl index b50258b9c..fdc46e661 100644 --- a/apps/emqx_plugins/src/emqx_plugins_serde.erl +++ b/apps/emqx_plugins/src/emqx_plugins_serde.erl @@ -33,7 +33,6 @@ init/1, handle_call/3, handle_cast/2, - handle_continue/2, terminate/2 ]). @@ -126,11 +125,10 @@ init(_) -> ]), State = #{}, AvscPaths = get_plugin_avscs(), - {ok, State, {continue, {build_serdes, AvscPaths}}}. - -handle_continue({build_serdes, AvscPaths}, State) -> + %% force build all schemas at startup + %% otherwise plugin schema may not be available when needed _ = build_serdes(AvscPaths), - {noreply, State}. + {ok, State}. handle_call({build_serdes, NameVsn, AvscPath}, _From, State) -> BuildRes = do_build_serde({NameVsn, AvscPath}), From e0e4517d9e1b1eae0bf3e527278a5ca3e31bf220 Mon Sep 17 00:00:00 2001 From: JimMoen Date: Fri, 17 May 2024 09:10:10 +0800 Subject: [PATCH 03/55] fix: ensure plugin config on boot --- .../src/emqx_mgmt_api_plugins.erl | 4 +- apps/emqx_plugins/src/emqx_plugins.erl | 46 +++++++++++++++++-- apps/emqx_plugins/src/emqx_plugins_serde.erl | 4 +- 3 files changed, 46 insertions(+), 8 deletions(-) diff --git a/apps/emqx_management/src/emqx_mgmt_api_plugins.erl b/apps/emqx_management/src/emqx_mgmt_api_plugins.erl index 94b16320a..bda053dee 100644 --- a/apps/emqx_management/src/emqx_mgmt_api_plugins.erl +++ b/apps/emqx_management/src/emqx_mgmt_api_plugins.erl @@ -599,9 +599,9 @@ ensure_action(Name, restart) -> ok. %% for RPC plugin avro encoded config update -do_update_plugin_config(Name, AvroJsonMap, PluginConfigMap) -> +do_update_plugin_config(NameVsn, AvroJsonMap, PluginConfigMap) -> %% TODO: maybe use `PluginConfigMap` to validate config - emqx_plugins:put_config(Name, AvroJsonMap, PluginConfigMap). + emqx_plugins:put_config(NameVsn, AvroJsonMap, PluginConfigMap). %%-------------------------------------------------------------------- %% Helper functions diff --git a/apps/emqx_plugins/src/emqx_plugins.erl b/apps/emqx_plugins/src/emqx_plugins.erl index 108ef1386..dc8be43fa 100644 --- a/apps/emqx_plugins/src/emqx_plugins.erl +++ b/apps/emqx_plugins/src/emqx_plugins.erl @@ -659,7 +659,8 @@ ensure_exists_and_installed(NameVsn) -> case get_tar(NameVsn) of {ok, TarContent} -> ok = file:write_file(pkg_file_path(NameVsn), TarContent), - ok = do_ensure_installed(NameVsn); + ok = do_ensure_installed(NameVsn), + ok = ensure_avro_config(NameVsn); _ -> %% If not, try to get it from the cluster. do_get_from_cluster(NameVsn) @@ -1046,6 +1047,7 @@ maybe_create_config_dir(NameVsn) -> ConfigDir = plugin_config_dir(NameVsn), case filelib:ensure_path(ConfigDir) of ok -> + _ = maybe_copy_default_avro_config(NameVsn), ok; {error, Reason} -> ?SLOG(warning, #{ @@ -1056,6 +1058,37 @@ maybe_create_config_dir(NameVsn) -> {error, {mkdir_failed, ConfigDir, Reason}} end. +maybe_copy_default_avro_config(NameVsn) -> + Source = default_avro_config_file(NameVsn), + Destination = avro_config_file(NameVsn), + filelib:is_regular(Source) andalso + case file:copy(Source, Destination) of + {ok, _} -> + ok, + ensure_avro_config(NameVsn); + {error, Reason} -> + ?SLOG(warning, #{ + msg => "failed_to_copy_plugin_default_avro_config", + source => Source, + destination => Destination, + reason => Reason + }) + end. + +%% ensure_avro_config() -> +%% ok = for_plugins(fun(NameVsn) -> ensure_avro_config(NameVsn) end). + +ensure_avro_config(NameVsn) -> + case read_plugin_avro(NameVsn, #{read_mode => ?JSON_MAP}) of + {ok, AvroJsonMap} -> + {ok, Config} = decode_plugin_avro_config( + NameVsn, emqx_utils_json:encode(AvroJsonMap) + ), + put_config(NameVsn, AvroJsonMap, Config); + _ -> + ok + end. + %% @private Backup the current config to a file with a timestamp suffix and %% then save the new config to the config file. backup_and_write_avro_bin(NameVsn, AvroBin) -> @@ -1148,7 +1181,7 @@ plugin_dir(NameVsn) -> -spec plugin_config_dir(name_vsn()) -> string(). plugin_config_dir(NameVsn) -> - wrap_list_path(filename:join([plugin_dir(NameVsn), "data", "configs"])). + wrap_list_path(filename:join([emqx:data_dir(), "plugins", NameVsn])). %% Files -spec pkg_file_path(name_vsn()) -> string(). @@ -1161,15 +1194,20 @@ info_file_path(NameVsn) -> -spec avsc_file_path(name_vsn()) -> string(). avsc_file_path(NameVsn) -> - wrap_list_path(filename:join([plugin_dir(NameVsn), "config_schema.avsc"])). + wrap_list_path(filename:join([plugin_dir(NameVsn), "config", "config_schema.avsc"])). -spec avro_config_file(name_vsn()) -> string(). avro_config_file(NameVsn) -> wrap_list_path(filename:join([plugin_config_dir(NameVsn), "config.avro"])). +%% should only used when plugin installing +-spec default_avro_config_file(name_vsn()) -> string(). +default_avro_config_file(NameVsn) -> + wrap_list_path(filename:join([plugin_dir(NameVsn), "config", "config.avro"])). + -spec i18n_file_path(name_vsn()) -> string(). i18n_file_path(NameVsn) -> - wrap_list_path(filename:join([plugin_dir(NameVsn), "config_i18n.json"])). + wrap_list_path(filename:join([plugin_dir(NameVsn), "config", "config_i18n.json"])). -spec readme_file(name_vsn()) -> string(). readme_file(NameVsn) -> diff --git a/apps/emqx_plugins/src/emqx_plugins_serde.erl b/apps/emqx_plugins/src/emqx_plugins_serde.erl index fdc46e661..7328c33ff 100644 --- a/apps/emqx_plugins/src/emqx_plugins_serde.erl +++ b/apps/emqx_plugins/src/emqx_plugins_serde.erl @@ -151,10 +151,10 @@ terminate(_Reason, _State) -> -spec get_plugin_avscs() -> [{string(), string()}]. get_plugin_avscs() -> - Pattern = filename:join([emqx_plugins:install_dir(), "*", "config_schema.avsc"]), + Pattern = filename:join([emqx_plugins:install_dir(), "*", "config", "config_schema.avsc"]), lists:foldl( fun(AvscPath, AccIn) -> - [_, NameVsn | _] = lists:reverse(filename:split(AvscPath)), + [_, _, NameVsn | _] = lists:reverse(filename:split(AvscPath)), [{to_bin(NameVsn), AvscPath} | AccIn] end, _Acc0 = [], From df7dcb2764227e9ade893804919d9ab09db87a64 Mon Sep 17 00:00:00 2001 From: JimMoen Date: Fri, 17 May 2024 14:16:10 +0800 Subject: [PATCH 04/55] fix: do not let plugin start failed lead emqx start failed --- apps/emqx_plugins/src/emqx_plugins.erl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/apps/emqx_plugins/src/emqx_plugins.erl b/apps/emqx_plugins/src/emqx_plugins.erl index dc8be43fa..39354f756 100644 --- a/apps/emqx_plugins/src/emqx_plugins.erl +++ b/apps/emqx_plugins/src/emqx_plugins.erl @@ -1013,8 +1013,11 @@ configured() -> for_plugins(ActionFun) -> case lists:flatmap(fun(I) -> for_plugin(I, ActionFun) end, configured()) of - [] -> ok; - Errors -> erlang:error(#{function => ActionFun, errors => Errors}) + [] -> + ok; + Errors -> + ?SLOG(error, #{function => ActionFun, errors => Errors}), + ok end. for_plugin(#{name_vsn := NameVsn, enable := true}, Fun) -> From a7f2f95318971f010ee288c5c334dbbc0a7dc402 Mon Sep 17 00:00:00 2001 From: JimMoen Date: Fri, 17 May 2024 15:48:21 +0800 Subject: [PATCH 05/55] fix: ensure avro file --- apps/emqx_plugins/src/emqx_plugins.erl | 135 ++++++++++++------ .../src/proto/emqx_plugins_proto_v1.erl | 4 + .../src/proto/emqx_plugins_proto_v2.erl | 39 +++++ 3 files changed, 134 insertions(+), 44 deletions(-) create mode 100644 apps/emqx_plugins/src/proto/emqx_plugins_proto_v2.erl diff --git a/apps/emqx_plugins/src/emqx_plugins.erl b/apps/emqx_plugins/src/emqx_plugins.erl index 39354f756..1f98b2bd4 100644 --- a/apps/emqx_plugins/src/emqx_plugins.erl +++ b/apps/emqx_plugins/src/emqx_plugins.erl @@ -67,7 +67,8 @@ -export([ decode_plugin_avro_config/2, install_dir/0, - avsc_file_path/1 + avsc_file_path/1, + with_plugin_avsc/1 ]). %% `emqx_config_handler' API @@ -332,6 +333,15 @@ decode_plugin_avro_config(NameVsn, AvroJsonBin) -> {error, ReasonMap} -> {error, ReasonMap} end. +-spec with_plugin_avsc(name_vsn()) -> boolean(). +with_plugin_avsc(NameVsn) -> + case read_plugin_info(NameVsn, #{fill_readme => false}) of + {ok, #{<<"with_config_schema">> := WithAvsc}} when is_boolean(WithAvsc) -> + WithAvsc; + _ -> + false + end. + get_config_interal(Key, Default) when is_atom(Key) -> get_config_interal([Key], Default); get_config_interal(Path, Default) -> @@ -438,7 +448,6 @@ do_ensure_installed(NameVsn) -> ok = write_tar_file_content(install_dir(), TarContent), case read_plugin_info(NameVsn, #{}) of {ok, _} -> - ok = maybe_post_op_after_install(NameVsn), ok; {error, Reason} -> ?SLOG(warning, Reason#{msg => "failed_to_read_after_install"}), @@ -572,6 +581,8 @@ do_ensure_started(NameVsn) -> case ensure_exists_and_installed(NameVsn) of ok -> Plugin = do_read_plugin(NameVsn), + %% ok = ensure_avro_config(NameVsn); + ok = maybe_post_op_after_install(NameVsn), ok = load_code_start_apps(NameVsn, Plugin); {error, plugin_not_found} -> ?SLOG(error, #{ @@ -611,16 +622,16 @@ tryit(WhichOp, F) -> read_plugin_info(NameVsn, Options) -> tryit( atom_to_list(?FUNCTION_NAME), - fun() -> {ok, do_read_plugin2(NameVsn, Options)} end + fun() -> {ok, do_read_plugin(NameVsn, Options)} end ). do_read_plugin(NameVsn) -> - do_read_plugin2(NameVsn, #{}). + do_read_plugin(NameVsn, #{}). -do_read_plugin2(NameVsn, Option) -> - do_read_plugin3(NameVsn, info_file_path(NameVsn), Option). +do_read_plugin(NameVsn, Option) -> + do_read_plugin(NameVsn, info_file_path(NameVsn), Option). -do_read_plugin3(NameVsn, InfoFilePath, Options) -> +do_read_plugin(NameVsn, InfoFilePath, Options) -> {ok, PlainMap} = (read_file_fun(InfoFilePath, "bad_info_file", #{read_mode => ?JSON_MAP}))(), Info0 = check_plugin(PlainMap, NameVsn, InfoFilePath), Info1 = plugins_readme(NameVsn, Options, Info0), @@ -659,8 +670,7 @@ ensure_exists_and_installed(NameVsn) -> case get_tar(NameVsn) of {ok, TarContent} -> ok = file:write_file(pkg_file_path(NameVsn), TarContent), - ok = do_ensure_installed(NameVsn), - ok = ensure_avro_config(NameVsn); + ok = do_ensure_installed(NameVsn); _ -> %% If not, try to get it from the cluster. do_get_from_cluster(NameVsn) @@ -698,6 +708,16 @@ get_from_any_node([Node | T], NameVsn, Errors) -> get_from_any_node(T, NameVsn, [{Node, Err} | Errors]) end. +get_config_from_any_node([], _NameVsn, Errors) -> + {error, Errors}; +get_config_from_any_node([Node | T], NameVsn, Errors) -> + case emqx_plugins_proto_v2:get_config(Node, NameVsn, 5_000) of + {ok, _} = Res -> + Res; + Err -> + get_from_any_node(T, NameVsn, [{Node, Err} | Errors]) + end. + plugins_readme(NameVsn, #{fill_readme := true}, Info) -> case file:read_file(readme_file(NameVsn)) of {ok, Bin} -> Info#{readme => Bin}; @@ -1031,13 +1051,15 @@ for_plugin(#{name_vsn := NameVsn, enable := false}, _Fun) -> maybe_post_op_after_install(NameVsn) -> _ = maybe_load_config_schema(NameVsn), - _ = maybe_create_config_dir(NameVsn), ok. maybe_load_config_schema(NameVsn) -> AvscPath = avsc_file_path(NameVsn), - filelib:is_regular(AvscPath) andalso - do_load_config_schema(NameVsn, AvscPath). + _ = + with_plugin_avsc(NameVsn) andalso + filelib:is_regular(AvscPath) andalso + do_load_config_schema(NameVsn, AvscPath), + _ = maybe_create_config_dir(NameVsn). do_load_config_schema(NameVsn, AvscPath) -> case emqx_plugins_serde:add_schema(NameVsn, AvscPath) of @@ -1047,39 +1069,54 @@ do_load_config_schema(NameVsn, AvscPath) -> end. maybe_create_config_dir(NameVsn) -> - ConfigDir = plugin_config_dir(NameVsn), - case filelib:ensure_path(ConfigDir) of - ok -> - _ = maybe_copy_default_avro_config(NameVsn), - ok; + with_plugin_avsc(NameVsn) andalso + do_create_config_dir(NameVsn). + +do_create_config_dir(NameVsn) -> + case plugin_config_dir(NameVsn) of {error, Reason} -> - ?SLOG(warning, #{ - msg => "failed_to_create_plugin_config_dir", - dir => ConfigDir, - reason => Reason - }), - {error, {mkdir_failed, ConfigDir, Reason}} + {error, {gen_config_dir_failed, Reason}}; + ConfigDir -> + case filelib:ensure_path(ConfigDir) of + ok -> + %% get config from other nodes or get from tarball + _ = maybe_ensure_plugin_config(NameVsn), + ok; + {error, Reason} -> + ?SLOG(warning, #{ + msg => "failed_to_create_plugin_config_dir", + dir => ConfigDir, + reason => Reason + }), + {error, {mkdir_failed, ConfigDir, Reason}} + end end. -maybe_copy_default_avro_config(NameVsn) -> - Source = default_avro_config_file(NameVsn), - Destination = avro_config_file(NameVsn), - filelib:is_regular(Source) andalso - case file:copy(Source, Destination) of - {ok, _} -> - ok, - ensure_avro_config(NameVsn); - {error, Reason} -> - ?SLOG(warning, #{ - msg => "failed_to_copy_plugin_default_avro_config", - source => Source, - destination => Destination, - reason => Reason - }) - end. - -%% ensure_avro_config() -> -%% ok = for_plugins(fun(NameVsn) -> ensure_avro_config(NameVsn) end). +maybe_ensure_plugin_config(NameVsn) -> + Nodes = [N || N <- mria:running_nodes(), N /= node()], + case get_config_from_any_node(Nodes, NameVsn, []) of + {ok, AvroBin} -> + ok = file:write_file(avro_config_file(NameVsn), AvroBin), + ensure_avro_config(NameVsn); + _ -> + %% always copy default avro file into config dir + %% when can not get config from other nodes + Source = default_avro_config_file(NameVsn), + Destination = avro_config_file(NameVsn), + filelib:is_regular(Source) andalso + case file:copy(Source, Destination) of + {ok, _} -> + ok, + ensure_avro_config(NameVsn); + {error, Reason} -> + ?SLOG(warning, #{ + msg => "failed_to_copy_plugin_default_avro_config", + source => Source, + destination => Destination, + reason => Reason + }) + end + end. ensure_avro_config(NameVsn) -> case read_plugin_avro(NameVsn, #{read_mode => ?JSON_MAP}) of @@ -1182,9 +1219,19 @@ read_file_fun(Path, ErrMsg, #{read_mode := ?JSON_MAP}) -> plugin_dir(NameVsn) -> wrap_list_path(filename:join([install_dir(), NameVsn])). --spec plugin_config_dir(name_vsn()) -> string(). +-spec plugin_config_dir(name_vsn()) -> string() | {error, Reason :: string()}. plugin_config_dir(NameVsn) -> - wrap_list_path(filename:join([emqx:data_dir(), "plugins", NameVsn])). + case parse_name_vsn(NameVsn) of + {ok, NameAtom, _Vsn} -> + wrap_list_path(filename:join([emqx:data_dir(), "plugins", atom_to_list(NameAtom)])); + {error, Reason} -> + ?SLOG(warning, #{ + msg => "failed_to_generate_plugin_config_dir_for_plugin", + plugin_namevsn => NameVsn, + reason => Reason + }), + {error, Reason} + end. %% Files -spec pkg_file_path(name_vsn()) -> string(). diff --git a/apps/emqx_plugins/src/proto/emqx_plugins_proto_v1.erl b/apps/emqx_plugins/src/proto/emqx_plugins_proto_v1.erl index bc1e31473..04e06337b 100644 --- a/apps/emqx_plugins/src/proto/emqx_plugins_proto_v1.erl +++ b/apps/emqx_plugins/src/proto/emqx_plugins_proto_v1.erl @@ -20,6 +20,7 @@ -export([ introduced_in/0, + deprecated_since/0, get_tar/3 ]). @@ -30,6 +31,9 @@ introduced_in() -> "5.0.21". +deprecated_since() -> + "5.7.0". + -spec get_tar(node(), name_vsn(), timeout()) -> {ok, binary()} | {error, any}. get_tar(Node, NameVsn, Timeout) -> rpc:call(Node, emqx_plugins, get_tar, [NameVsn], Timeout). diff --git a/apps/emqx_plugins/src/proto/emqx_plugins_proto_v2.erl b/apps/emqx_plugins/src/proto/emqx_plugins_proto_v2.erl new file mode 100644 index 000000000..480082e28 --- /dev/null +++ b/apps/emqx_plugins/src/proto/emqx_plugins_proto_v2.erl @@ -0,0 +1,39 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2024 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_plugins_proto_v2). + +-behaviour(emqx_bpapi). + +-export([ + introduced_in/0, + get_tar/3, + get_config/3 +]). + +-include_lib("emqx/include/bpapi.hrl"). + +-type name_vsn() :: binary() | string(). + +introduced_in() -> + "5.7.0". + +-spec get_tar(node(), name_vsn(), timeout()) -> {ok, binary()} | {error, any}. +get_tar(Node, NameVsn, Timeout) -> + rpc:call(Node, emqx_plugins, get_tar, [NameVsn], Timeout). + +get_config(Node, NameVsn, Timeout) -> + rpc:call(Node, emqx_plugins, get_config, [NameVsn], Timeout). From 140b7ce51ed636b902d3e13d64d64bcc50c3c326 Mon Sep 17 00:00:00 2001 From: JimMoen Date: Mon, 20 May 2024 11:06:02 +0800 Subject: [PATCH 06/55] fix(plugin): schema content only provided in enterprise edition --- apps/emqx_management/src/emqx_mgmt_api_plugins.erl | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/apps/emqx_management/src/emqx_mgmt_api_plugins.erl b/apps/emqx_management/src/emqx_mgmt_api_plugins.erl index bda053dee..5835b9120 100644 --- a/apps/emqx_management/src/emqx_mgmt_api_plugins.erl +++ b/apps/emqx_management/src/emqx_mgmt_api_plugins.erl @@ -695,10 +695,15 @@ aggregate_status([{Node, Plugins} | List], Acc) -> aggregate_status(List, NewAcc). format_plugin_avsc_and_i18n(NameVsn) -> - #{ - avsc => try_read_file(fun() -> emqx_plugins:plugin_avsc(NameVsn) end), - i18n => try_read_file(fun() -> emqx_plugins:plugin_i18n(NameVsn) end) - }. + case emqx_release:edition() of + ee -> + #{ + avsc => try_read_file(fun() -> emqx_plugins:plugin_avsc(NameVsn) end), + i18n => try_read_file(fun() -> emqx_plugins:plugin_i18n(NameVsn) end) + }; + ce -> + #{avsc => null, i18n => null} + end. try_read_file(Fun) -> case Fun() of From 2076681e7608c2e989289c132495fadeaca099dd Mon Sep 17 00:00:00 2001 From: JimMoen Date: Fri, 17 May 2024 17:11:19 +0800 Subject: [PATCH 07/55] fix: put plugin config with binary namevsn key --- apps/emqx_plugins/include/emqx_plugins.hrl | 2 + apps/emqx_plugins/src/emqx_plugins.appup.src | 8 -- apps/emqx_plugins/src/emqx_plugins.erl | 82 ++++++++++++------- .../src/proto/emqx_plugins_proto_v2.erl | 6 +- 4 files changed, 59 insertions(+), 39 deletions(-) delete mode 100644 apps/emqx_plugins/src/emqx_plugins.appup.src diff --git a/apps/emqx_plugins/include/emqx_plugins.hrl b/apps/emqx_plugins/include/emqx_plugins.hrl index f822b9c8d..d1411884f 100644 --- a/apps/emqx_plugins/include/emqx_plugins.hrl +++ b/apps/emqx_plugins/include/emqx_plugins.hrl @@ -24,6 +24,8 @@ -define(CONFIG_FORMAT_AVRO, config_format_avro). -define(CONFIG_FORMAT_MAP, config_format_map). +-define(plugin_conf_not_found, plugin_conf_not_found). + -type schema_name() :: binary(). -type avsc_path() :: string(). diff --git a/apps/emqx_plugins/src/emqx_plugins.appup.src b/apps/emqx_plugins/src/emqx_plugins.appup.src deleted file mode 100644 index f9474dd33..000000000 --- a/apps/emqx_plugins/src/emqx_plugins.appup.src +++ /dev/null @@ -1,8 +0,0 @@ -%% -*- mode: erlang -*- -{"0.1.0", - [ {<<".*">>, []} - ], - [ - {<<".*">>, []} - ] -}. diff --git a/apps/emqx_plugins/src/emqx_plugins.erl b/apps/emqx_plugins/src/emqx_plugins.erl index 1f98b2bd4..156edce0b 100644 --- a/apps/emqx_plugins/src/emqx_plugins.erl +++ b/apps/emqx_plugins/src/emqx_plugins.erl @@ -80,7 +80,10 @@ -export([get_tar/1]). %% Internal export --export([do_ensure_started/1]). +-export([ + ensure_avro_config/1, + do_ensure_started/1 +]). %% for test cases -export([put_config_internal/2]). @@ -150,7 +153,8 @@ ensure_installed(NameVsn) -> ok; {error, _} -> ok = purge(NameVsn), - do_ensure_installed(NameVsn) + do_ensure_installed(NameVsn), + ok = maybe_post_op_after_install(NameVsn) end. %% @doc Ensure files and directories for the given plugin are being deleted. @@ -288,9 +292,13 @@ get_config(NameVsn, #{format := ?CONFIG_FORMAT_MAP}, Default) -> %% @doc Update plugin's config. %% RPC call from Management API or CLI. %% the avro Json Map and plugin config ALWAYS be valid before calling this function. +put_config(NameVsn, AvroJsonMap, DecodedPluginConfig) when not is_binary(NameVsn) -> + put_config(bin(NameVsn), AvroJsonMap, DecodedPluginConfig); put_config(NameVsn, AvroJsonMap, _DecodedPluginConfig) -> AvroJsonBin = emqx_utils_json:encode(AvroJsonMap), ok = backup_and_write_avro_bin(NameVsn, AvroJsonBin), + %% {ok, AppName, AppVsn} = parse_name_vsn(AppNameVsn), + %% ok = PluginModule:on_config_changed(NameVsn, AvroJsonMap), ok = persistent_term:put(?PLUGIN_PERSIS_CONFIG_KEY(NameVsn), AvroJsonMap), ok. @@ -581,8 +589,6 @@ do_ensure_started(NameVsn) -> case ensure_exists_and_installed(NameVsn) of ok -> Plugin = do_read_plugin(NameVsn), - %% ok = ensure_avro_config(NameVsn); - ok = maybe_post_op_after_install(NameVsn), ok = load_code_start_apps(NameVsn, Plugin); {error, plugin_not_found} -> ?SLOG(error, #{ @@ -614,7 +620,11 @@ tryit(WhichOp, F) -> exception => Reason, stacktrace => Stacktrace }), - {error, {failed, WhichOp}} + {error, #{ + which_op => WhichOp, + exception => Reason, + stacktrace => Stacktrace + }} end. %% read plugin info from the JSON file @@ -708,10 +718,14 @@ get_from_any_node([Node | T], NameVsn, Errors) -> get_from_any_node(T, NameVsn, [{Node, Err} | Errors]) end. -get_config_from_any_node([], _NameVsn, Errors) -> +get_avro_config_from_any_node([], _NameVsn, Errors) -> {error, Errors}; -get_config_from_any_node([Node | T], NameVsn, Errors) -> - case emqx_plugins_proto_v2:get_config(Node, NameVsn, 5_000) of +get_avro_config_from_any_node([Node | T], NameVsn, Errors) -> + case + emqx_plugins_proto_v2:get_config( + Node, NameVsn, #{format => ?CONFIG_FORMAT_MAP}, ?plugin_conf_not_found, 5_000 + ) + of {ok, _} = Res -> Res; Err -> @@ -1041,6 +1055,8 @@ for_plugins(ActionFun) -> end. for_plugin(#{name_vsn := NameVsn, enable := true}, Fun) -> + %% always ensure the plugin is installed + ok = ensure_avro_config(NameVsn), case Fun(NameVsn) of ok -> []; {error, Reason} -> [{NameVsn, Reason}] @@ -1094,31 +1110,41 @@ do_create_config_dir(NameVsn) -> maybe_ensure_plugin_config(NameVsn) -> Nodes = [N || N <- mria:running_nodes(), N /= node()], - case get_config_from_any_node(Nodes, NameVsn, []) of - {ok, AvroBin} -> - ok = file:write_file(avro_config_file(NameVsn), AvroBin), + case get_avro_config_from_any_node(Nodes, NameVsn, []) of + {ok, AvroJsonMap} when is_map(AvroJsonMap) -> + AvroJsonBin = emqx_utils_json:encode(AvroJsonMap), + ok = file:write_file(avro_config_file(NameVsn), AvroJsonBin), ensure_avro_config(NameVsn); + %% {ok, ?plugin_conf_not_found} -> _ -> - %% always copy default avro file into config dir - %% when can not get config from other nodes - Source = default_avro_config_file(NameVsn), - Destination = avro_config_file(NameVsn), - filelib:is_regular(Source) andalso - case file:copy(Source, Destination) of - {ok, _} -> - ok, - ensure_avro_config(NameVsn); - {error, Reason} -> - ?SLOG(warning, #{ - msg => "failed_to_copy_plugin_default_avro_config", - source => Source, - destination => Destination, - reason => Reason - }) - end + cp_default_avro_file(NameVsn), + ensure_avro_config(NameVsn) end. +cp_default_avro_file(NameVsn) -> + %% always copy default avro file into config dir + %% when can not get config from other nodes + Source = default_avro_config_file(NameVsn), + Destination = avro_config_file(NameVsn), + filelib:is_regular(Source) andalso + case file:copy(Source, Destination) of + {ok, _} -> + ok, + ensure_avro_config(NameVsn); + {error, Reason} -> + ?SLOG(warning, #{ + msg => "failed_to_copy_plugin_default_avro_config", + source => Source, + destination => Destination, + reason => Reason + }) + end. + ensure_avro_config(NameVsn) -> + with_plugin_avsc(NameVsn) andalso + do_ensure_avro_config(NameVsn). + +do_ensure_avro_config(NameVsn) -> case read_plugin_avro(NameVsn, #{read_mode => ?JSON_MAP}) of {ok, AvroJsonMap} -> {ok, Config} = decode_plugin_avro_config( diff --git a/apps/emqx_plugins/src/proto/emqx_plugins_proto_v2.erl b/apps/emqx_plugins/src/proto/emqx_plugins_proto_v2.erl index 480082e28..01d0a5ce1 100644 --- a/apps/emqx_plugins/src/proto/emqx_plugins_proto_v2.erl +++ b/apps/emqx_plugins/src/proto/emqx_plugins_proto_v2.erl @@ -21,7 +21,7 @@ -export([ introduced_in/0, get_tar/3, - get_config/3 + get_config/5 ]). -include_lib("emqx/include/bpapi.hrl"). @@ -35,5 +35,5 @@ introduced_in() -> get_tar(Node, NameVsn, Timeout) -> rpc:call(Node, emqx_plugins, get_tar, [NameVsn], Timeout). -get_config(Node, NameVsn, Timeout) -> - rpc:call(Node, emqx_plugins, get_config, [NameVsn], Timeout). +get_config(Node, NameVsn, Opts, Default, Timeout) -> + rpc:call(Node, emqx_plugins, get_config, [NameVsn, Opts, Default], Timeout). From 677352e49875a64c06369a6afa8c304ac07e49e4 Mon Sep 17 00:00:00 2001 From: JimMoen Date: Mon, 20 May 2024 15:38:54 +0800 Subject: [PATCH 08/55] fix: ensure installed and plugin state on boot --- apps/emqx_plugins/src/emqx_plugins.erl | 44 +++++++++++++++------- apps/emqx_plugins/src/emqx_plugins_app.erl | 1 + 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/apps/emqx_plugins/src/emqx_plugins.erl b/apps/emqx_plugins/src/emqx_plugins.erl index 156edce0b..81dc90c2e 100644 --- a/apps/emqx_plugins/src/emqx_plugins.erl +++ b/apps/emqx_plugins/src/emqx_plugins.erl @@ -34,6 +34,7 @@ %% Package operations -export([ + ensure_installed/0, ensure_installed/1, ensure_uninstalled/1, ensure_enabled/1, @@ -145,6 +146,15 @@ make_name_vsn_string(Name, Vsn) -> %%-------------------------------------------------------------------- %% Package operations +%% @doc Start all configured plugins are started. +-spec ensure_installed() -> ok. +ensure_installed() -> + Fun = fun(#{name_vsn := NameVsn}) -> + ensure_installed(NameVsn), + [] + end, + ok = for_plugins(Fun). + %% @doc Install a .tar.gz package placed in install_dir. -spec ensure_installed(name_vsn()) -> ok | {error, map()}. ensure_installed(NameVsn) -> @@ -235,7 +245,17 @@ delete_package(NameVsn) -> %% @doc Start all configured plugins are started. -spec ensure_started() -> ok. ensure_started() -> - ok = for_plugins(fun ?MODULE:do_ensure_started/1). + Fun = fun + (#{name_vsn := NameVsn, enable := true}) -> + case do_ensure_started(NameVsn) of + ok -> []; + {error, Reason} -> [{NameVsn, Reason}] + end; + (#{name_vsn := NameVsn, enable := false}) -> + ?SLOG(debug, #{msg => "plugin_disabled", name_vsn => NameVsn}), + [] + end, + ok = for_plugins(Fun). %% @doc Start a plugin from Management API or CLI. %% the input is a - string. @@ -252,7 +272,11 @@ ensure_started(NameVsn) -> %% @doc Stop all plugins before broker stops. -spec ensure_stopped() -> ok. ensure_stopped() -> - for_plugins(fun ?MODULE:ensure_stopped/1). + Fun = fun(#{name_vsn := NameVsn, enable := true}) -> + ensure_stopped(NameVsn), + [] + end, + ok = for_plugins(Fun). %% @doc Stop a plugin from Management API or CLI. -spec ensure_stopped(name_vsn()) -> ok | {error, term()}. @@ -297,6 +321,8 @@ put_config(NameVsn, AvroJsonMap, DecodedPluginConfig) when not is_binary(NameVsn put_config(NameVsn, AvroJsonMap, _DecodedPluginConfig) -> AvroJsonBin = emqx_utils_json:encode(AvroJsonMap), ok = backup_and_write_avro_bin(NameVsn, AvroJsonBin), + %% TODO: callback in plugin's on_config_changed (config update by mgmt API) + %% TODO: callback in plugin's on_config_upgraded (config vsn upgrade v1 -> v2) %% {ok, AppName, AppVsn} = parse_name_vsn(AppNameVsn), %% ok = PluginModule:on_config_changed(NameVsn, AvroJsonMap), ok = persistent_term:put(?PLUGIN_PERSIS_CONFIG_KEY(NameVsn), AvroJsonMap), @@ -1046,7 +1072,7 @@ configured() -> get_config_interal(states, []). for_plugins(ActionFun) -> - case lists:flatmap(fun(I) -> for_plugin(I, ActionFun) end, configured()) of + case lists:flatmap(ActionFun, configured()) of [] -> ok; Errors -> @@ -1054,18 +1080,8 @@ for_plugins(ActionFun) -> ok end. -for_plugin(#{name_vsn := NameVsn, enable := true}, Fun) -> - %% always ensure the plugin is installed - ok = ensure_avro_config(NameVsn), - case Fun(NameVsn) of - ok -> []; - {error, Reason} -> [{NameVsn, Reason}] - end; -for_plugin(#{name_vsn := NameVsn, enable := false}, _Fun) -> - ?SLOG(debug, #{msg => "plugin_disabled", name_vsn => NameVsn}), - []. - maybe_post_op_after_install(NameVsn) -> + _ = ensure_state(NameVsn, _Position = no_move, _Enabled = false, _ConfLocation = global), _ = maybe_load_config_schema(NameVsn), ok. diff --git a/apps/emqx_plugins/src/emqx_plugins_app.erl b/apps/emqx_plugins/src/emqx_plugins_app.erl index 740bcdd22..10e483c22 100644 --- a/apps/emqx_plugins/src/emqx_plugins_app.erl +++ b/apps/emqx_plugins/src/emqx_plugins_app.erl @@ -28,6 +28,7 @@ start(_Type, _Args) -> %% load all pre-configured {ok, Sup} = emqx_plugins_sup:start_link(), + ok = emqx_plugins:ensure_installed(), ok = emqx_plugins:ensure_started(), ok = emqx_config_handler:add_handler([?CONF_ROOT], emqx_plugins), {ok, Sup}. From 1abc8cf5021557dcc33ff091e0f54160a6594991 Mon Sep 17 00:00:00 2001 From: JimMoen Date: Mon, 20 May 2024 16:04:29 +0800 Subject: [PATCH 09/55] fix: ensure plugin's config on boot and join cluster --- apps/emqx_plugins/src/emqx_plugins.erl | 38 ++++++++++++++++++++------ 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/apps/emqx_plugins/src/emqx_plugins.erl b/apps/emqx_plugins/src/emqx_plugins.erl index 81dc90c2e..ab295e082 100644 --- a/apps/emqx_plugins/src/emqx_plugins.erl +++ b/apps/emqx_plugins/src/emqx_plugins.erl @@ -16,6 +16,8 @@ -module(emqx_plugins). +-feature(maybe_expr, enable). + -include_lib("emqx/include/logger.hrl"). -include("emqx_plugins.hrl"). @@ -160,11 +162,16 @@ ensure_installed() -> ensure_installed(NameVsn) -> case read_plugin_info(NameVsn, #{}) of {ok, _} -> - ok; + ok, + _ = maybe_ensure_plugin_config(NameVsn); {error, _} -> ok = purge(NameVsn), - do_ensure_installed(NameVsn), - ok = maybe_post_op_after_install(NameVsn) + case do_ensure_installed(NameVsn) of + ok -> + maybe_post_op_after_installed(NameVsn); + _ -> + ok + end end. %% @doc Ensure files and directories for the given plugin are being deleted. @@ -1080,9 +1087,9 @@ for_plugins(ActionFun) -> ok end. -maybe_post_op_after_install(NameVsn) -> - _ = ensure_state(NameVsn, _Position = no_move, _Enabled = false, _ConfLocation = global), +maybe_post_op_after_installed(NameVsn) -> _ = maybe_load_config_schema(NameVsn), + _ = ensure_state(NameVsn, no_move, false, global), ok. maybe_load_config_schema(NameVsn) -> @@ -1125,6 +1132,14 @@ do_create_config_dir(NameVsn) -> end. maybe_ensure_plugin_config(NameVsn) -> + maybe + true ?= filelib:is_regular(avro_config_file(NameVsn)), + ensure_avro_config(NameVsn) + else + _ -> do_ensure_plugin_config(NameVsn) + end. + +do_ensure_plugin_config(NameVsn) -> Nodes = [N || N <- mria:running_nodes(), N /= node()], case get_avro_config_from_any_node(Nodes, NameVsn, []) of {ok, AvroJsonMap} when is_map(AvroJsonMap) -> @@ -1142,11 +1157,13 @@ cp_default_avro_file(NameVsn) -> %% when can not get config from other nodes Source = default_avro_config_file(NameVsn), Destination = avro_config_file(NameVsn), - filelib:is_regular(Source) andalso + maybe + true ?= filelib:is_regular(Source), + %% destination path not existed (not configured) + true ?= (not filelib:is_regular(Destination)), case file:copy(Source, Destination) of {ok, _} -> - ok, - ensure_avro_config(NameVsn); + ok; {error, Reason} -> ?SLOG(warning, #{ msg => "failed_to_copy_plugin_default_avro_config", @@ -1154,7 +1171,10 @@ cp_default_avro_file(NameVsn) -> destination => Destination, reason => Reason }) - end. + end + else + _ -> ensure_avro_config(NameVsn) + end. ensure_avro_config(NameVsn) -> with_plugin_avsc(NameVsn) andalso From 3ce091e9d775ef9c9fb0ecd705853774688f4881 Mon Sep 17 00:00:00 2001 From: JimMoen Date: Mon, 20 May 2024 16:31:11 +0800 Subject: [PATCH 10/55] fix: ensure plugin tarball and extracted --- apps/emqx_plugins/src/emqx_plugins.erl | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/apps/emqx_plugins/src/emqx_plugins.erl b/apps/emqx_plugins/src/emqx_plugins.erl index ab295e082..aa693286f 100644 --- a/apps/emqx_plugins/src/emqx_plugins.erl +++ b/apps/emqx_plugins/src/emqx_plugins.erl @@ -166,7 +166,7 @@ ensure_installed(NameVsn) -> _ = maybe_ensure_plugin_config(NameVsn); {error, _} -> ok = purge(NameVsn), - case do_ensure_installed(NameVsn) of + case ensure_exists_and_installed(NameVsn) of ok -> maybe_post_op_after_installed(NameVsn); _ -> @@ -303,7 +303,7 @@ get_config(Name, Vsn, Options, Default) -> {ok, plugin_config()} | {error, term()}. get_config(NameVsn) -> - get_config(bin(NameVsn), #{format => ?CONFIG_FORMAT_MAP}). + get_config(NameVsn, #{format => ?CONFIG_FORMAT_MAP}). -spec get_config(name_vsn(), Options :: map()) -> {ok, avro_binary() | plugin_config()} @@ -317,8 +317,9 @@ get_config(NameVsn, #{format := ?CONFIG_FORMAT_AVRO}) -> get_config(NameVsn, Options = #{format := ?CONFIG_FORMAT_MAP}) -> get_config(NameVsn, Options, #{}). +%% Present default config value only in map format. get_config(NameVsn, #{format := ?CONFIG_FORMAT_MAP}, Default) -> - {ok, persistent_term:get(?PLUGIN_PERSIS_CONFIG_KEY(NameVsn), Default)}. + {ok, persistent_term:get(?PLUGIN_PERSIS_CONFIG_KEY(bin(NameVsn)), Default)}. %% @doc Update plugin's config. %% RPC call from Management API or CLI. @@ -1133,21 +1134,24 @@ do_create_config_dir(NameVsn) -> maybe_ensure_plugin_config(NameVsn) -> maybe - true ?= filelib:is_regular(avro_config_file(NameVsn)), - ensure_avro_config(NameVsn) + true ?= with_plugin_avsc(NameVsn), + _ = do_ensure_plugin_config(NameVsn) else - _ -> do_ensure_plugin_config(NameVsn) + _ -> ok end. do_ensure_plugin_config(NameVsn) -> + %% fetch plugin avro config from cluster Nodes = [N || N <- mria:running_nodes(), N /= node()], case get_avro_config_from_any_node(Nodes, NameVsn, []) of {ok, AvroJsonMap} when is_map(AvroJsonMap) -> AvroJsonBin = emqx_utils_json:encode(AvroJsonMap), ok = file:write_file(avro_config_file(NameVsn), AvroJsonBin), ensure_avro_config(NameVsn); - %% {ok, ?plugin_conf_not_found} -> _ -> + ?SLOG(warning, #{msg => "config_not_found_from_cluster"}), + %% otherwise cp default avro file + %% i.e. Clean installation cp_default_avro_file(NameVsn), ensure_avro_config(NameVsn) end. From 87b3b214b98c29ceb399fc9d24e78e06d4a7ecac Mon Sep 17 00:00:00 2001 From: JimMoen Date: Mon, 20 May 2024 18:46:41 +0800 Subject: [PATCH 11/55] fix: make static_check happy --- apps/emqx/priv/bpapi.versions | 1 + .../src/emqx_mgmt_api_plugins.erl | 23 +++++++++------- apps/emqx_plugins/src/emqx_plugins.erl | 26 +++++++++++-------- .../src/proto/emqx_plugins_proto_v1.erl | 4 --- .../src/proto/emqx_plugins_proto_v2.erl | 1 + 5 files changed, 30 insertions(+), 25 deletions(-) diff --git a/apps/emqx/priv/bpapi.versions b/apps/emqx/priv/bpapi.versions index 10d27fc63..7c78d43d9 100644 --- a/apps/emqx/priv/bpapi.versions +++ b/apps/emqx/priv/bpapi.versions @@ -64,6 +64,7 @@ {emqx_node_rebalance_status,2}. {emqx_persistent_session_ds,1}. {emqx_plugins,1}. +{emqx_plugins,2}. {emqx_prometheus,1}. {emqx_prometheus,2}. {emqx_resource,1}. diff --git a/apps/emqx_management/src/emqx_mgmt_api_plugins.erl b/apps/emqx_management/src/emqx_mgmt_api_plugins.erl index 5835b9120..fdfbd864a 100644 --- a/apps/emqx_management/src/emqx_mgmt_api_plugins.erl +++ b/apps/emqx_management/src/emqx_mgmt_api_plugins.erl @@ -21,6 +21,8 @@ -include_lib("emqx/include/logger.hrl"). -include_lib("emqx_plugins/include/emqx_plugins.hrl"). +-dialyzer({no_match, [format_plugin_avsc_and_i18n/1]}). + -export([ api_spec/0, fields/1, @@ -534,7 +536,7 @@ update_boot_order(post, #{bindings := #{name := Name}, body := Body}) -> {error, Reason} -> {400, #{code => 'BAD_POSITION', message => Reason}}; Position -> - case emqx_plugins:ensure_enabled(Name, Position, _ConfLocation = global) of + case emqx_plugins:ensure_enabled(Name, Position, global) of ok -> {204}; {error, Reason} -> @@ -694,16 +696,12 @@ aggregate_status([{Node, Plugins} | List], Acc) -> ), aggregate_status(List, NewAcc). +-if(?EMQX_RELEASE_EDITION == ee). format_plugin_avsc_and_i18n(NameVsn) -> - case emqx_release:edition() of - ee -> - #{ - avsc => try_read_file(fun() -> emqx_plugins:plugin_avsc(NameVsn) end), - i18n => try_read_file(fun() -> emqx_plugins:plugin_i18n(NameVsn) end) - }; - ce -> - #{avsc => null, i18n => null} - end. + #{ + avsc => try_read_file(fun() -> emqx_plugins:plugin_avsc(NameVsn) end), + i18n => try_read_file(fun() -> emqx_plugins:plugin_i18n(NameVsn) end) + }. try_read_file(Fun) -> case Fun() of @@ -711,6 +709,11 @@ try_read_file(Fun) -> _ -> null end. +-else. +format_plugin_avsc_and_i18n(_NameVsn) -> + #{avsc => null, i18n => null}. +-endif. + % running_status: running loaded, stopped %% config_status: not_configured disable enable plugin_status(#{running_status := running}) -> running; diff --git a/apps/emqx_plugins/src/emqx_plugins.erl b/apps/emqx_plugins/src/emqx_plugins.erl index aa693286f..f00b663b6 100644 --- a/apps/emqx_plugins/src/emqx_plugins.erl +++ b/apps/emqx_plugins/src/emqx_plugins.erl @@ -152,8 +152,10 @@ make_name_vsn_string(Name, Vsn) -> -spec ensure_installed() -> ok. ensure_installed() -> Fun = fun(#{name_vsn := NameVsn}) -> - ensure_installed(NameVsn), - [] + case ensure_installed(NameVsn) of + ok -> []; + {error, Reason} -> [{NameVsn, Reason}] + end end, ok = for_plugins(Fun). @@ -169,8 +171,8 @@ ensure_installed(NameVsn) -> case ensure_exists_and_installed(NameVsn) of ok -> maybe_post_op_after_installed(NameVsn); - _ -> - ok + {error, _Reason} = Err -> + Err end end. @@ -280,8 +282,10 @@ ensure_started(NameVsn) -> -spec ensure_stopped() -> ok. ensure_stopped() -> Fun = fun(#{name_vsn := NameVsn, enable := true}) -> - ensure_stopped(NameVsn), - [] + case ensure_stopped(NameVsn) of + ok -> []; + {error, Reason} -> [{NameVsn, Reason}] + end end, ok = for_plugins(Fun). @@ -314,7 +318,7 @@ get_config(NameVsn, #{format := ?CONFIG_FORMAT_AVRO}) -> {ok, _AvroJson} = Res -> Res; {error, _Reason} = Err -> Err end; -get_config(NameVsn, Options = #{format := ?CONFIG_FORMAT_MAP}) -> +get_config(NameVsn, #{format := ?CONFIG_FORMAT_MAP} = Options) -> get_config(NameVsn, Options, #{}). %% Present default config value only in map format. @@ -714,7 +718,7 @@ ensure_exists_and_installed(NameVsn) -> case get_tar(NameVsn) of {ok, TarContent} -> ok = file:write_file(pkg_file_path(NameVsn), TarContent), - ok = do_ensure_installed(NameVsn); + do_ensure_installed(NameVsn); _ -> %% If not, try to get it from the cluster. do_get_from_cluster(NameVsn) @@ -733,13 +737,13 @@ do_get_from_cluster(NameVsn) -> name_vsn => NameVsn, node_errors => NodeErrors }), - {error, plugin_not_found}; + {error, #{reason => not_found, name_vsn => NameVsn}}; {error, _} -> ?SLOG(error, #{ msg => "no_nodes_to_copy_plugin_from", name_vsn => NameVsn }), - {error, plugin_not_found} + {error, #{reason => not_found, name_vsn => NameVsn}} end. get_from_any_node([], _NameVsn, Errors) -> @@ -763,7 +767,7 @@ get_avro_config_from_any_node([Node | T], NameVsn, Errors) -> {ok, _} = Res -> Res; Err -> - get_from_any_node(T, NameVsn, [{Node, Err} | Errors]) + get_avro_config_from_any_node(T, NameVsn, [{Node, Err} | Errors]) end. plugins_readme(NameVsn, #{fill_readme := true}, Info) -> diff --git a/apps/emqx_plugins/src/proto/emqx_plugins_proto_v1.erl b/apps/emqx_plugins/src/proto/emqx_plugins_proto_v1.erl index 04e06337b..bc1e31473 100644 --- a/apps/emqx_plugins/src/proto/emqx_plugins_proto_v1.erl +++ b/apps/emqx_plugins/src/proto/emqx_plugins_proto_v1.erl @@ -20,7 +20,6 @@ -export([ introduced_in/0, - deprecated_since/0, get_tar/3 ]). @@ -31,9 +30,6 @@ introduced_in() -> "5.0.21". -deprecated_since() -> - "5.7.0". - -spec get_tar(node(), name_vsn(), timeout()) -> {ok, binary()} | {error, any}. get_tar(Node, NameVsn, Timeout) -> rpc:call(Node, emqx_plugins, get_tar, [NameVsn], Timeout). diff --git a/apps/emqx_plugins/src/proto/emqx_plugins_proto_v2.erl b/apps/emqx_plugins/src/proto/emqx_plugins_proto_v2.erl index 01d0a5ce1..68b8090a3 100644 --- a/apps/emqx_plugins/src/proto/emqx_plugins_proto_v2.erl +++ b/apps/emqx_plugins/src/proto/emqx_plugins_proto_v2.erl @@ -35,5 +35,6 @@ introduced_in() -> get_tar(Node, NameVsn, Timeout) -> rpc:call(Node, emqx_plugins, get_tar, [NameVsn], Timeout). +-spec get_config(node(), name_vsn(), map(), any(), timeout()) -> {ok, any()} | {error, any()}. get_config(Node, NameVsn, Opts, Default, Timeout) -> rpc:call(Node, emqx_plugins, get_config, [NameVsn, Opts, Default], Timeout). From 5abd23af5ae393374659003bf682ea1b49927e9c Mon Sep 17 00:00:00 2001 From: JimMoen Date: Tue, 21 May 2024 15:13:25 +0800 Subject: [PATCH 12/55] test: plugin refactor --- apps/emqx_plugins/src/emqx_plugins.erl | 48 ++++++++++++------- apps/emqx_plugins/test/emqx_plugins_SUITE.erl | 43 ++++++++++++++--- 2 files changed, 68 insertions(+), 23 deletions(-) diff --git a/apps/emqx_plugins/src/emqx_plugins.erl b/apps/emqx_plugins/src/emqx_plugins.erl index f00b663b6..da5bca312 100644 --- a/apps/emqx_plugins/src/emqx_plugins.erl +++ b/apps/emqx_plugins/src/emqx_plugins.erl @@ -18,8 +18,9 @@ -feature(maybe_expr, enable). --include_lib("emqx/include/logger.hrl"). -include("emqx_plugins.hrl"). +-include_lib("emqx/include/logger.hrl"). +-include_lib("snabbkaffe/include/trace.hrl"). -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). @@ -281,11 +282,15 @@ ensure_started(NameVsn) -> %% @doc Stop all plugins before broker stops. -spec ensure_stopped() -> ok. ensure_stopped() -> - Fun = fun(#{name_vsn := NameVsn, enable := true}) -> - case ensure_stopped(NameVsn) of - ok -> []; - {error, Reason} -> [{NameVsn, Reason}] - end + Fun = fun + (#{name_vsn := NameVsn, enable := true}) -> + case ensure_stopped(NameVsn) of + ok -> []; + {error, Reason} -> [{NameVsn, Reason}] + end; + (#{name_vsn := NameVsn, enable := false}) -> + ?SLOG(debug, #{msg => "plugin_disabled", action => stop_plugin, name_vsn => NameVsn}), + [] end, ok = for_plugins(Fun). @@ -732,18 +737,22 @@ do_get_from_cluster(NameVsn) -> ok = file:write_file(pkg_file_path(NameVsn), TarContent), ok = do_ensure_installed(NameVsn); {error, NodeErrors} when Nodes =/= [] -> - ?SLOG(error, #{ - msg => "failed_to_copy_plugin_from_other_nodes", + ErrMeta = #{ + error_msg => "failed_to_copy_plugin_from_other_nodes", name_vsn => NameVsn, - node_errors => NodeErrors - }), - {error, #{reason => not_found, name_vsn => NameVsn}}; + node_errors => NodeErrors, + reason => not_found + }, + ?SLOG(error, ErrMeta), + {error, ErrMeta}; {error, _} -> - ?SLOG(error, #{ - msg => "no_nodes_to_copy_plugin_from", - name_vsn => NameVsn - }), - {error, #{reason => not_found, name_vsn => NameVsn}} + ErrMeta = #{ + error_msg => "no_nodes_to_copy_plugin_from", + name_vsn => NameVsn, + reason => not_found + }, + ?SLOG(error, ErrMeta), + {error, ErrMeta} end. get_from_any_node([], _NameVsn, Errors) -> @@ -1088,7 +1097,12 @@ for_plugins(ActionFun) -> [] -> ok; Errors -> - ?SLOG(error, #{function => ActionFun, errors => Errors}), + ErrMeta = #{function => ActionFun, errors => Errors}, + ?tp( + for_plugins_action_error_occurred, + ErrMeta + ), + ?SLOG(error, ErrMeta), ok end. diff --git a/apps/emqx_plugins/test/emqx_plugins_SUITE.erl b/apps/emqx_plugins/test/emqx_plugins_SUITE.erl index 80f3d7a48..5c8c51f1e 100644 --- a/apps/emqx_plugins/test/emqx_plugins_SUITE.erl +++ b/apps/emqx_plugins/test/emqx_plugins_SUITE.erl @@ -21,6 +21,7 @@ -include_lib("eunit/include/eunit.hrl"). -include_lib("common_test/include/ct.hrl"). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). -define(EMQX_PLUGIN_APP_NAME, my_emqx_plugin). -define(EMQX_PLUGIN_TEMPLATE_RELEASE_NAME, atom_to_list(?EMQX_PLUGIN_APP_NAME)). @@ -273,9 +274,15 @@ t_start_restart_and_stop(Config) -> %% fake enable bar-2 ok = ensure_state(Bar2, rear, true), %% should cause an error - ?assertError( - #{function := _, errors := [_ | _]}, - emqx_plugins:ensure_started() + ?check_trace( + emqx_plugins:ensure_started(), + fun(Trace) -> + ?assertMatch( + [#{function := _, errors := [_ | _]}], + ?of_kind(for_plugins_action_error_occurred, Trace) + ), + ok + end ), %% but demo plugin should still be running assert_app_running(?EMQX_PLUGIN_APP_NAME, true), @@ -337,7 +344,7 @@ t_enable_disable({'end', Config}) -> t_enable_disable(Config) -> NameVsn = proplists:get_value(name_vsn, Config), ok = emqx_plugins:ensure_installed(NameVsn), - ?assertEqual([], emqx_plugins:configured()), + ?assertEqual([#{name_vsn => NameVsn, enable => false}], emqx_plugins:configured()), ok = emqx_plugins:ensure_enabled(NameVsn), ?assertEqual([#{name_vsn => NameVsn, enable => true}], emqx_plugins:configured()), ok = emqx_plugins:ensure_disabled(NameVsn), @@ -379,9 +386,10 @@ t_bad_tar_gz(Config) -> }}, emqx_plugins:ensure_installed("fake-vsn") ), + %% the plugin tarball can not be found on any nodes ?assertMatch( {error, #{ - error_msg := "failed_to_extract_plugin_package", + error_msg := "no_nodes_to_copy_plugin_from", reason := not_found }}, emqx_plugins:ensure_installed("nonexisting") @@ -556,7 +564,7 @@ t_load_config_from_cli({'end', Config}) -> t_load_config_from_cli(Config) when is_list(Config) -> NameVsn = ?config(name_vsn, Config), ok = emqx_plugins:ensure_installed(NameVsn), - ?assertEqual([], emqx_plugins:configured()), + ?assertEqual([#{name_vsn => NameVsn, enable => false}], emqx_plugins:configured()), ok = emqx_plugins:ensure_enabled(NameVsn), ok = emqx_plugins:ensure_started(NameVsn), Params0 = unused, @@ -687,6 +695,14 @@ group_t_copy_plugin_to_a_new_node(Config) -> %% see: emqx_conf_app:init_conf/0 ok = rpc:call(CopyToNode, application, stop, [emqx_plugins]), {ok, _} = rpc:call(CopyToNode, application, ensure_all_started, [emqx_plugins]), + + %% Plugin config should be synced from `CopyFromNode` + %% by application `emqx` and `emqx_conf` + %% FIXME: in test case, we manually do it here + ok = rpc:call(CopyToNode, emqx_plugins, put_config_internal, [[states], CopyFromPluginsState]), + ok = rpc:call(CopyToNode, emqx_plugins, ensure_installed, []), + ok = rpc:call(CopyToNode, emqx_plugins, ensure_started, []), + ?assertMatch( {ok, #{running_status := running, config_status := enabled}}, rpc:call(CopyToNode, emqx_plugins, describe, [NameVsn]) @@ -739,6 +755,16 @@ group_t_copy_plugin_to_a_new_node_single_node(Config) -> ct:pal("~p install_dir:\n ~p", [ CopyToNode, erpc:call(CopyToNode, file, list_dir, [ToInstallDir]) ]), + + %% Plugin config should be synced from `CopyFromNode` + %% by application `emqx` and `emqx_conf` + %% FIXME: in test case, we manually do it here + ok = rpc:call(CopyToNode, emqx_plugins, put_config_internal, [ + [states], [#{enable => true, name_vsn => NameVsn}] + ]), + ok = rpc:call(CopyToNode, emqx_plugins, ensure_installed, []), + ok = rpc:call(CopyToNode, emqx_plugins, ensure_started, []), + ?assertMatch( {ok, #{running_status := running, config_status := enabled}}, rpc:call(CopyToNode, emqx_plugins, describe, [NameVsn]) @@ -785,6 +811,11 @@ group_t_cluster_leave(Config) -> ok = erpc:call(N1, emqx_plugins, ensure_installed, [NameVsn]), ok = erpc:call(N1, emqx_plugins, ensure_started, [NameVsn]), ok = erpc:call(N1, emqx_plugins, ensure_enabled, [NameVsn]), + + ok = erpc:call(N2, emqx_plugins, ensure_installed, [NameVsn]), + ok = erpc:call(N2, emqx_plugins, ensure_started, [NameVsn]), + ok = erpc:call(N2, emqx_plugins, ensure_enabled, [NameVsn]), + Params = unused, %% 2 nodes running ?assertMatch( From 14f2a687996730cffa138fb794b01a7a65a0698f Mon Sep 17 00:00:00 2001 From: JimMoen Date: Wed, 22 May 2024 03:05:31 +0800 Subject: [PATCH 13/55] fix: bpapi spec type --- .../src/emqx_mgmt_api_plugins.erl | 11 +++--- apps/emqx_plugins/src/emqx_plugins.erl | 36 +++++++++++-------- .../src/proto/emqx_plugins_proto_v2.erl | 11 +++--- 3 files changed, 35 insertions(+), 23 deletions(-) diff --git a/apps/emqx_management/src/emqx_mgmt_api_plugins.erl b/apps/emqx_management/src/emqx_mgmt_api_plugins.erl index fdfbd864a..ca0d3ac10 100644 --- a/apps/emqx_management/src/emqx_mgmt_api_plugins.erl +++ b/apps/emqx_management/src/emqx_mgmt_api_plugins.erl @@ -180,6 +180,9 @@ schema("/plugins/:name/config") -> responses => #{ %% avro data, json encoded 200 => hoconsc:mk(binary()), + 400 => emqx_dashboard_swagger:error_codes( + ['BAD_CONFIG'], <<"Plugin Config Not Found">> + ), 404 => emqx_dashboard_swagger:error_codes(['NOT_FOUND'], <<"Plugin Not Found">>) } }, @@ -490,13 +493,13 @@ update_plugin(put, #{bindings := #{name := Name, action := Action}}) -> plugin_config(get, #{bindings := #{name := NameVsn}}) -> case emqx_plugins:describe(NameVsn) of {ok, _} -> - case emqx_plugins:get_config(NameVsn) of - {ok, AvroJson} -> + case emqx_plugins:get_config(NameVsn, ?CONFIG_FORMAT_MAP, ?plugin_conf_not_found) of + {ok, AvroJson} when is_map(AvroJson) -> {200, #{<<"content-type">> => <<"'application/json'">>}, AvroJson}; - {error, _} -> + {ok, ?plugin_conf_not_found} -> {400, #{ code => 'BAD_CONFIG', - message => <<"Failed to get plugin config">> + message => <<"Plugin Config Not Found">> }} end; _ -> diff --git a/apps/emqx_plugins/src/emqx_plugins.erl b/apps/emqx_plugins/src/emqx_plugins.erl index da5bca312..11ef113b0 100644 --- a/apps/emqx_plugins/src/emqx_plugins.erl +++ b/apps/emqx_plugins/src/emqx_plugins.erl @@ -305,30 +305,36 @@ ensure_stopped(NameVsn) -> end ). -get_config(Name, Vsn, Options, Default) -> - get_config(make_name_vsn_string(Name, Vsn), Options, Default). +get_config(Name, Vsn, Opt, Default) -> + get_config(make_name_vsn_string(Name, Vsn), Opt, Default). -spec get_config(name_vsn()) -> - {ok, plugin_config()} + {ok, plugin_config() | any()} | {error, term()}. get_config(NameVsn) -> - get_config(NameVsn, #{format => ?CONFIG_FORMAT_MAP}). + get_config(NameVsn, ?CONFIG_FORMAT_MAP, #{}). --spec get_config(name_vsn(), Options :: map()) -> - {ok, avro_binary() | plugin_config()} +-spec get_config(name_vsn(), ?CONFIG_FORMAT_MAP | ?CONFIG_FORMAT_AVRO) -> + {ok, avro_binary() | plugin_config() | any()} | {error, term()}. -get_config(NameVsn, #{format := ?CONFIG_FORMAT_AVRO}) -> +get_config(NameVsn, ?CONFIG_FORMAT_MAP) -> + get_config(NameVsn, ?CONFIG_FORMAT_MAP, #{}); +get_config(NameVsn, ?CONFIG_FORMAT_AVRO) -> + get_config_bin(NameVsn). + +%% Present default config value only in map format. +-spec get_config(name_vsn(), ?CONFIG_FORMAT_MAP, any()) -> + {ok, plugin_config() | any()} + | {error, term()}. +get_config(NameVsn, ?CONFIG_FORMAT_MAP, Default) -> + {ok, persistent_term:get(?PLUGIN_PERSIS_CONFIG_KEY(bin(NameVsn)), Default)}. + +get_config_bin(NameVsn) -> %% no default value when get raw binary config case read_plugin_avro(NameVsn) of {ok, _AvroJson} = Res -> Res; {error, _Reason} = Err -> Err - end; -get_config(NameVsn, #{format := ?CONFIG_FORMAT_MAP} = Options) -> - get_config(NameVsn, Options, #{}). - -%% Present default config value only in map format. -get_config(NameVsn, #{format := ?CONFIG_FORMAT_MAP}, Default) -> - {ok, persistent_term:get(?PLUGIN_PERSIS_CONFIG_KEY(bin(NameVsn)), Default)}. + end. %% @doc Update plugin's config. %% RPC call from Management API or CLI. @@ -770,7 +776,7 @@ get_avro_config_from_any_node([], _NameVsn, Errors) -> get_avro_config_from_any_node([Node | T], NameVsn, Errors) -> case emqx_plugins_proto_v2:get_config( - Node, NameVsn, #{format => ?CONFIG_FORMAT_MAP}, ?plugin_conf_not_found, 5_000 + Node, NameVsn, ?CONFIG_FORMAT_MAP, ?plugin_conf_not_found, 5_000 ) of {ok, _} = Res -> diff --git a/apps/emqx_plugins/src/proto/emqx_plugins_proto_v2.erl b/apps/emqx_plugins/src/proto/emqx_plugins_proto_v2.erl index 68b8090a3..0e933d351 100644 --- a/apps/emqx_plugins/src/proto/emqx_plugins_proto_v2.erl +++ b/apps/emqx_plugins/src/proto/emqx_plugins_proto_v2.erl @@ -24,6 +24,7 @@ get_config/5 ]). +-include("emqx_plugins.hrl"). -include_lib("emqx/include/bpapi.hrl"). -type name_vsn() :: binary() | string(). @@ -31,10 +32,12 @@ introduced_in() -> "5.7.0". --spec get_tar(node(), name_vsn(), timeout()) -> {ok, binary()} | {error, any}. +-spec get_tar(node(), name_vsn(), timeout()) -> {ok, binary()} | {error, any()}. get_tar(Node, NameVsn, Timeout) -> rpc:call(Node, emqx_plugins, get_tar, [NameVsn], Timeout). --spec get_config(node(), name_vsn(), map(), any(), timeout()) -> {ok, any()} | {error, any()}. -get_config(Node, NameVsn, Opts, Default, Timeout) -> - rpc:call(Node, emqx_plugins, get_config, [NameVsn, Opts, Default], Timeout). +-spec get_config( + node(), name_vsn(), ?CONFIG_FORMAT_MAP, any(), timeout() +) -> {ok, map() | any()} | {error, any()}. +get_config(Node, NameVsn, Opt, Default, Timeout) -> + rpc:call(Node, emqx_plugins, get_config, [NameVsn, Opt, Default], Timeout). From 33aa61daeab671c69a9a570b8339b6c3a21cdda9 Mon Sep 17 00:00:00 2001 From: JimMoen Date: Wed, 22 May 2024 06:37:58 +0800 Subject: [PATCH 14/55] fix: use hocon format as plugin config --- apps/emqx_plugins/src/emqx_plugins.erl | 46 ++++++++++++-------- apps/emqx_plugins/src/emqx_plugins_serde.erl | 4 +- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/apps/emqx_plugins/src/emqx_plugins.erl b/apps/emqx_plugins/src/emqx_plugins.erl index 11ef113b0..1a8a679c6 100644 --- a/apps/emqx_plugins/src/emqx_plugins.erl +++ b/apps/emqx_plugins/src/emqx_plugins.erl @@ -342,8 +342,8 @@ get_config_bin(NameVsn) -> put_config(NameVsn, AvroJsonMap, DecodedPluginConfig) when not is_binary(NameVsn) -> put_config(bin(NameVsn), AvroJsonMap, DecodedPluginConfig); put_config(NameVsn, AvroJsonMap, _DecodedPluginConfig) -> - AvroJsonBin = emqx_utils_json:encode(AvroJsonMap), - ok = backup_and_write_avro_bin(NameVsn, AvroJsonBin), + HoconBin = hocon_pp:do(AvroJsonMap, #{}), + ok = backup_and_write_avro_bin(NameVsn, HoconBin), %% TODO: callback in plugin's on_config_changed (config update by mgmt API) %% TODO: callback in plugin's on_config_upgraded (config vsn upgrade v1 -> v2) %% {ok, AppName, AppVsn} = parse_name_vsn(AppNameVsn), @@ -717,7 +717,7 @@ read_plugin_avro(NameVsn) -> read_plugin_avro(NameVsn, Options) -> tryit( atom_to_list(?FUNCTION_NAME), - read_file_fun(avro_config_file(NameVsn), "bad_avro_file", Options) + read_file_fun(plugin_config_file(NameVsn), "bad_avro_file", Options) ). ensure_exists_and_installed(NameVsn) -> @@ -1170,21 +1170,21 @@ do_ensure_plugin_config(NameVsn) -> case get_avro_config_from_any_node(Nodes, NameVsn, []) of {ok, AvroJsonMap} when is_map(AvroJsonMap) -> AvroJsonBin = emqx_utils_json:encode(AvroJsonMap), - ok = file:write_file(avro_config_file(NameVsn), AvroJsonBin), + ok = file:write_file(plugin_config_file(NameVsn), AvroJsonBin), ensure_avro_config(NameVsn); _ -> - ?SLOG(warning, #{msg => "config_not_found_from_cluster"}), + ?SLOG(info, #{msg => "config_not_found_from_cluster"}), %% otherwise cp default avro file %% i.e. Clean installation - cp_default_avro_file(NameVsn), + cp_default_config_file(NameVsn), ensure_avro_config(NameVsn) end. -cp_default_avro_file(NameVsn) -> +cp_default_config_file(NameVsn) -> %% always copy default avro file into config dir %% when can not get config from other nodes - Source = default_avro_config_file(NameVsn), - Destination = avro_config_file(NameVsn), + Source = default_plugin_config_file(NameVsn), + Destination = plugin_config_file(NameVsn), maybe true ?= filelib:is_regular(Source), %% destination path not existed (not configured) @@ -1224,7 +1224,7 @@ do_ensure_avro_config(NameVsn) -> backup_and_write_avro_bin(NameVsn, AvroBin) -> %% this may fail, but we don't care %% e.g. read-only file system - Path = avro_config_file(NameVsn), + Path = plugin_config_file(NameVsn), _ = filelib:ensure_dir(Path), TmpFile = Path ++ ".tmp", case file:write_file(TmpFile, AvroBin) of @@ -1309,6 +1309,16 @@ read_file_fun(Path, ErrMsg, #{read_mode := ?JSON_MAP}) -> plugin_dir(NameVsn) -> wrap_list_path(filename:join([install_dir(), NameVsn])). +-spec plugin_priv_dir(name_vsn()) -> string(). +plugin_priv_dir(NameVsn) -> + case read_plugin_info(NameVsn, #{fill_readme => false}) of + {ok, #{<<"name">> := Name, <<"metadata_vsn">> := Vsn}} -> + AppDir = <>, + wrap_list_path(filename:join([plugin_dir(NameVsn), AppDir, "priv"])); + _ -> + wrap_list_path(filename:join([install_dir(), NameVsn, "priv"])) + end. + -spec plugin_config_dir(name_vsn()) -> string() | {error, Reason :: string()}. plugin_config_dir(NameVsn) -> case parse_name_vsn(NameVsn) of @@ -1334,20 +1344,20 @@ info_file_path(NameVsn) -> -spec avsc_file_path(name_vsn()) -> string(). avsc_file_path(NameVsn) -> - wrap_list_path(filename:join([plugin_dir(NameVsn), "config", "config_schema.avsc"])). + wrap_list_path(filename:join([plugin_priv_dir(NameVsn), "config_schema.avsc"])). --spec avro_config_file(name_vsn()) -> string(). -avro_config_file(NameVsn) -> - wrap_list_path(filename:join([plugin_config_dir(NameVsn), "config.avro"])). +-spec plugin_config_file(name_vsn()) -> string(). +plugin_config_file(NameVsn) -> + wrap_list_path(filename:join([plugin_config_dir(NameVsn), "config.hocon"])). %% should only used when plugin installing --spec default_avro_config_file(name_vsn()) -> string(). -default_avro_config_file(NameVsn) -> - wrap_list_path(filename:join([plugin_dir(NameVsn), "config", "config.avro"])). +-spec default_plugin_config_file(name_vsn()) -> string(). +default_plugin_config_file(NameVsn) -> + wrap_list_path(filename:join([plugin_priv_dir(NameVsn), "config.hocon"])). -spec i18n_file_path(name_vsn()) -> string(). i18n_file_path(NameVsn) -> - wrap_list_path(filename:join([plugin_dir(NameVsn), "config", "config_i18n.json"])). + wrap_list_path(filename:join([plugin_priv_dir(NameVsn), "config_i18n.json"])). -spec readme_file(name_vsn()) -> string(). readme_file(NameVsn) -> diff --git a/apps/emqx_plugins/src/emqx_plugins_serde.erl b/apps/emqx_plugins/src/emqx_plugins_serde.erl index 7328c33ff..b76f71991 100644 --- a/apps/emqx_plugins/src/emqx_plugins_serde.erl +++ b/apps/emqx_plugins/src/emqx_plugins_serde.erl @@ -151,10 +151,10 @@ terminate(_Reason, _State) -> -spec get_plugin_avscs() -> [{string(), string()}]. get_plugin_avscs() -> - Pattern = filename:join([emqx_plugins:install_dir(), "*", "config", "config_schema.avsc"]), + Pattern = filename:join([emqx_plugins:install_dir(), "*", "*", "priv", "config_schema.avsc"]), lists:foldl( fun(AvscPath, AccIn) -> - [_, _, NameVsn | _] = lists:reverse(filename:split(AvscPath)), + [_, _, _, NameVsn | _] = lists:reverse(filename:split(AvscPath)), [{to_bin(NameVsn), AvscPath} | AccIn] end, _Acc0 = [], From e5f7aa981738977e1b28d7a0dcf6863ae596b6a4 Mon Sep 17 00:00:00 2001 From: JimMoen Date: Wed, 22 May 2024 11:08:55 +0800 Subject: [PATCH 15/55] refactor: plguin functions and types rename --- .../src/emqx_mgmt_api_plugins.erl | 6 +- apps/emqx_plugins/include/emqx_plugins.hrl | 12 +- apps/emqx_plugins/src/emqx_plugins.erl | 160 +++++++++--------- .../src/proto/emqx_plugins_proto_v2.erl | 2 - 4 files changed, 91 insertions(+), 89 deletions(-) diff --git a/apps/emqx_management/src/emqx_mgmt_api_plugins.erl b/apps/emqx_management/src/emqx_mgmt_api_plugins.erl index ca0d3ac10..ed5f016b3 100644 --- a/apps/emqx_management/src/emqx_mgmt_api_plugins.erl +++ b/apps/emqx_management/src/emqx_mgmt_api_plugins.erl @@ -508,7 +508,7 @@ plugin_config(get, #{bindings := #{name := NameVsn}}) -> plugin_config(put, #{bindings := #{name := NameVsn}, body := AvroJsonMap}) -> case emqx_plugins:describe(NameVsn) of {ok, _} -> - case emqx_plugins:decode_plugin_avro_config(NameVsn, AvroJsonMap) of + case emqx_plugins:decode_plugin_config_map(NameVsn, AvroJsonMap) of {ok, AvroValueConfig} -> Nodes = emqx:running_nodes(), %% cluster call with config in map (binary key-value) @@ -702,8 +702,8 @@ aggregate_status([{Node, Plugins} | List], Acc) -> -if(?EMQX_RELEASE_EDITION == ee). format_plugin_avsc_and_i18n(NameVsn) -> #{ - avsc => try_read_file(fun() -> emqx_plugins:plugin_avsc(NameVsn) end), - i18n => try_read_file(fun() -> emqx_plugins:plugin_i18n(NameVsn) end) + avsc => try_read_file(fun() -> emqx_plugins:plugin_schema_json(NameVsn) end), + i18n => try_read_file(fun() -> emqx_plugins:plugin_i18n_json(NameVsn) end) }. try_read_file(Fun) -> diff --git a/apps/emqx_plugins/include/emqx_plugins.hrl b/apps/emqx_plugins/include/emqx_plugins.hrl index d1411884f..66f23d4b0 100644 --- a/apps/emqx_plugins/include/emqx_plugins.hrl +++ b/apps/emqx_plugins/include/emqx_plugins.hrl @@ -21,7 +21,7 @@ -define(PLUGIN_SERDE_TAB, emqx_plugins_schema_serde_tab). --define(CONFIG_FORMAT_AVRO, config_format_avro). +-define(CONFIG_FORMAT_BIN, config_format_bin). -define(CONFIG_FORMAT_MAP, config_format_map). -define(plugin_conf_not_found, plugin_conf_not_found). @@ -32,6 +32,16 @@ -type encoded_data() :: iodata(). -type decoded_data() :: map(). +%% "my_plugin-0.1.0" +-type name_vsn() :: binary() | string(). +%% the parse result of the JSON info file +-type plugin_info() :: map(). +-type schema_json_map() :: map(). +-type i18n_json_map() :: map(). +-type raw_plugin_config_content() :: binary(). +-type plugin_config_map() :: map(). +-type position() :: no_move | front | rear | {before, name_vsn()} | {behind, name_vsn()}. + -record(plugin_schema_serde, { name :: schema_name(), eval_context :: term(), diff --git a/apps/emqx_plugins/src/emqx_plugins.erl b/apps/emqx_plugins/src/emqx_plugins.erl index 1a8a679c6..ae97e7f5f 100644 --- a/apps/emqx_plugins/src/emqx_plugins.erl +++ b/apps/emqx_plugins/src/emqx_plugins.erl @@ -28,9 +28,9 @@ -export([ describe/1, - plugin_avsc/1, - plugin_i18n/1, - plugin_avro/1, + plugin_schema_json/1, + plugin_i18n_json/1, + raw_plugin_config_content/1, parse_name_vsn/1, make_name_vsn_string/2 ]). @@ -69,7 +69,7 @@ %% Package utils -export([ - decode_plugin_avro_config/2, + decode_plugin_config_map/2, install_dir/0, avsc_file_path/1, with_plugin_avsc/1 @@ -85,7 +85,7 @@ %% Internal export -export([ - ensure_avro_config/1, + ensure_config_map/1, do_ensure_started/1 ]). %% for test cases @@ -104,36 +104,26 @@ -define(MAX_KEEP_BACKUP_CONFIGS, 10). -%% "my_plugin-0.1.0" --type name_vsn() :: binary() | string(). -%% the parse result of the JSON info file --type plugin() :: map(). --type schema_json() :: map(). --type i18n_json() :: map(). --type avro_binary() :: binary(). --type plugin_config() :: map(). --type position() :: no_move | front | rear | {before, name_vsn()} | {behind, name_vsn()}. - %%-------------------------------------------------------------------- %% APIs %%-------------------------------------------------------------------- %% @doc Describe a plugin. --spec describe(name_vsn()) -> {ok, plugin()} | {error, any()}. +-spec describe(name_vsn()) -> {ok, plugin_info()} | {error, any()}. describe(NameVsn) -> read_plugin_info(NameVsn, #{fill_readme => true}). --spec plugin_avsc(name_vsn()) -> {ok, schema_json()} | {error, any()}. -plugin_avsc(NameVsn) -> +-spec plugin_schema_json(name_vsn()) -> {ok, schema_json_map()} | {error, any()}. +plugin_schema_json(NameVsn) -> read_plugin_avsc(NameVsn). --spec plugin_i18n(name_vsn()) -> {ok, i18n_json()} | {error, any()}. -plugin_i18n(NameVsn) -> +-spec plugin_i18n_json(name_vsn()) -> {ok, i18n_json_map()} | {error, any()}. +plugin_i18n_json(NameVsn) -> read_plugin_i18n(NameVsn). --spec plugin_avro(name_vsn()) -> {ok, avro_binary()} | {error, any()}. -plugin_avro(NameVsn) -> - read_plugin_avro(NameVsn). +-spec raw_plugin_config_content(name_vsn()) -> {ok, raw_plugin_config_content()} | {error, any()}. +raw_plugin_config_content(NameVsn) -> + read_plugin_hocon(NameVsn). parse_name_vsn(NameVsn) when is_binary(NameVsn) -> parse_name_vsn(binary_to_list(NameVsn)); @@ -309,46 +299,44 @@ get_config(Name, Vsn, Opt, Default) -> get_config(make_name_vsn_string(Name, Vsn), Opt, Default). -spec get_config(name_vsn()) -> - {ok, plugin_config() | any()} + {ok, plugin_config_map() | any()} | {error, term()}. get_config(NameVsn) -> get_config(NameVsn, ?CONFIG_FORMAT_MAP, #{}). --spec get_config(name_vsn(), ?CONFIG_FORMAT_MAP | ?CONFIG_FORMAT_AVRO) -> - {ok, avro_binary() | plugin_config() | any()} +-spec get_config(name_vsn(), ?CONFIG_FORMAT_MAP | ?CONFIG_FORMAT_BIN) -> + {ok, raw_plugin_config_content() | plugin_config_map() | any()} | {error, term()}. get_config(NameVsn, ?CONFIG_FORMAT_MAP) -> get_config(NameVsn, ?CONFIG_FORMAT_MAP, #{}); -get_config(NameVsn, ?CONFIG_FORMAT_AVRO) -> +get_config(NameVsn, ?CONFIG_FORMAT_BIN) -> get_config_bin(NameVsn). %% Present default config value only in map format. -spec get_config(name_vsn(), ?CONFIG_FORMAT_MAP, any()) -> - {ok, plugin_config() | any()} + {ok, plugin_config_map() | any()} | {error, term()}. get_config(NameVsn, ?CONFIG_FORMAT_MAP, Default) -> {ok, persistent_term:get(?PLUGIN_PERSIS_CONFIG_KEY(bin(NameVsn)), Default)}. get_config_bin(NameVsn) -> %% no default value when get raw binary config - case read_plugin_avro(NameVsn) of - {ok, _AvroJson} = Res -> Res; + case read_plugin_hocon(NameVsn) of + {ok, _Map} = Res -> Res; {error, _Reason} = Err -> Err end. %% @doc Update plugin's config. %% RPC call from Management API or CLI. -%% the avro Json Map and plugin config ALWAYS be valid before calling this function. -put_config(NameVsn, AvroJsonMap, DecodedPluginConfig) when not is_binary(NameVsn) -> - put_config(bin(NameVsn), AvroJsonMap, DecodedPluginConfig); -put_config(NameVsn, AvroJsonMap, _DecodedPluginConfig) -> - HoconBin = hocon_pp:do(AvroJsonMap, #{}), - ok = backup_and_write_avro_bin(NameVsn, HoconBin), +%% the plugin config Json Map and plugin config ALWAYS be valid before calling this function. +put_config(NameVsn, ConfigJsonMap, DecodedPluginConfig) when not is_binary(NameVsn) -> + put_config(bin(NameVsn), ConfigJsonMap, DecodedPluginConfig); +put_config(NameVsn, ConfigJsonMap, _DecodedPluginConfig) -> + HoconBin = hocon_pp:do(ConfigJsonMap, #{}), + ok = backup_and_write_hocon_bin(NameVsn, HoconBin), %% TODO: callback in plugin's on_config_changed (config update by mgmt API) %% TODO: callback in plugin's on_config_upgraded (config vsn upgrade v1 -> v2) - %% {ok, AppName, AppVsn} = parse_name_vsn(AppNameVsn), - %% ok = PluginModule:on_config_changed(NameVsn, AvroJsonMap), - ok = persistent_term:put(?PLUGIN_PERSIS_CONFIG_KEY(NameVsn), AvroJsonMap), + ok = persistent_term:put(?PLUGIN_PERSIS_CONFIG_KEY(NameVsn), ConfigJsonMap), ok. %% @doc Stop and then start the plugin. @@ -360,7 +348,7 @@ restart(NameVsn) -> %% @doc List all installed plugins. %% Including the ones that are installed, but not enabled in config. --spec list() -> [plugin()]. +-spec list() -> [plugin_info()]. list() -> Pattern = filename:join([install_dir(), "*", "release.json"]), All = lists:filtermap( @@ -381,10 +369,10 @@ list() -> %%-------------------------------------------------------------------- %% Package utils --spec decode_plugin_avro_config(name_vsn(), map() | binary()) -> {ok, map()} | {error, any()}. -decode_plugin_avro_config(NameVsn, AvroJsonMap) when is_map(AvroJsonMap) -> - decode_plugin_avro_config(NameVsn, emqx_utils_json:encode(AvroJsonMap)); -decode_plugin_avro_config(NameVsn, AvroJsonBin) -> +-spec decode_plugin_config_map(name_vsn(), map() | binary()) -> {ok, map()} | {error, any()}. +decode_plugin_config_map(NameVsn, AvroJsonMap) when is_map(AvroJsonMap) -> + decode_plugin_config_map(NameVsn, emqx_utils_json:encode(AvroJsonMap)); +decode_plugin_config_map(NameVsn, AvroJsonBin) -> case emqx_plugins_serde:decode(NameVsn, AvroJsonBin) of {ok, Config} -> {ok, Config}; {error, ReasonMap} -> {error, ReasonMap} @@ -712,12 +700,12 @@ read_plugin_i18n(NameVsn, Options) -> read_file_fun(i18n_file_path(NameVsn), "bad_i18n_file", Options) ). -read_plugin_avro(NameVsn) -> - read_plugin_avro(NameVsn, #{read_mode => ?RAW_BIN}). -read_plugin_avro(NameVsn, Options) -> +read_plugin_hocon(NameVsn) -> + read_plugin_hocon(NameVsn, #{read_mode => ?RAW_BIN}). +read_plugin_hocon(NameVsn, Options) -> tryit( atom_to_list(?FUNCTION_NAME), - read_file_fun(plugin_config_file(NameVsn), "bad_avro_file", Options) + read_file_fun(plugin_config_file(NameVsn), "bad_hocon_file", Options) ). ensure_exists_and_installed(NameVsn) -> @@ -738,7 +726,7 @@ ensure_exists_and_installed(NameVsn) -> do_get_from_cluster(NameVsn) -> Nodes = [N || N <- mria:running_nodes(), N /= node()], - case get_from_any_node(Nodes, NameVsn, []) of + case get_plugin_tar_from_any_node(Nodes, NameVsn, []) of {ok, TarContent} -> ok = file:write_file(pkg_file_path(NameVsn), TarContent), ok = do_ensure_installed(NameVsn); @@ -761,19 +749,19 @@ do_get_from_cluster(NameVsn) -> {error, ErrMeta} end. -get_from_any_node([], _NameVsn, Errors) -> +get_plugin_tar_from_any_node([], _NameVsn, Errors) -> {error, Errors}; -get_from_any_node([Node | T], NameVsn, Errors) -> +get_plugin_tar_from_any_node([Node | T], NameVsn, Errors) -> case emqx_plugins_proto_v1:get_tar(Node, NameVsn, infinity) of {ok, _} = Res -> Res; Err -> - get_from_any_node(T, NameVsn, [{Node, Err} | Errors]) + get_plugin_tar_from_any_node(T, NameVsn, [{Node, Err} | Errors]) end. -get_avro_config_from_any_node([], _NameVsn, Errors) -> +get_plugin_config_from_any_node([], _NameVsn, Errors) -> {error, Errors}; -get_avro_config_from_any_node([Node | T], NameVsn, Errors) -> +get_plugin_config_from_any_node([Node | T], NameVsn, Errors) -> case emqx_plugins_proto_v2:get_config( Node, NameVsn, ?CONFIG_FORMAT_MAP, ?plugin_conf_not_found, 5_000 @@ -782,7 +770,7 @@ get_avro_config_from_any_node([Node | T], NameVsn, Errors) -> {ok, _} = Res -> Res; Err -> - get_avro_config_from_any_node(T, NameVsn, [{Node, Err} | Errors]) + get_plugin_config_from_any_node(T, NameVsn, [{Node, Err} | Errors]) end. plugins_readme(NameVsn, #{fill_readme := true}, Info) -> @@ -1159,30 +1147,37 @@ do_create_config_dir(NameVsn) -> maybe_ensure_plugin_config(NameVsn) -> maybe true ?= with_plugin_avsc(NameVsn), - _ = do_ensure_plugin_config(NameVsn) + _ = ensure_plugin_config(NameVsn) else _ -> ok end. -do_ensure_plugin_config(NameVsn) -> - %% fetch plugin avro config from cluster +ensure_plugin_config(NameVsn) -> + %% fetch plugin hocon config from cluster Nodes = [N || N <- mria:running_nodes(), N /= node()], - case get_avro_config_from_any_node(Nodes, NameVsn, []) of - {ok, AvroJsonMap} when is_map(AvroJsonMap) -> - AvroJsonBin = emqx_utils_json:encode(AvroJsonMap), - ok = file:write_file(plugin_config_file(NameVsn), AvroJsonBin), - ensure_avro_config(NameVsn); + ensure_plugin_config(NameVsn, Nodes). +ensure_plugin_config(NameVsn, []) -> + ?SLOG(debug, #{ + msg => "default_plugin_config_used", + name_vsn => NameVsn, + reason => "no_other_running_nodes" + }), + cp_default_config_file(NameVsn); +ensure_plugin_config(NameVsn, Nodes) -> + case get_plugin_config_from_any_node(Nodes, NameVsn, []) of + {ok, ConfigMap} when is_map(ConfigMap) -> + HoconBin = hocon_pp:do(ConfigMap, #{}), + ok = file:write_file(plugin_config_file(NameVsn), HoconBin), + ensure_config_map(NameVsn); _ -> - ?SLOG(info, #{msg => "config_not_found_from_cluster"}), - %% otherwise cp default avro file + ?SLOG(error, #{msg => "config_not_found_from_cluster", name_vsn => NameVsn}), + %% otherwise cp default hocon file %% i.e. Clean installation - cp_default_config_file(NameVsn), - ensure_avro_config(NameVsn) + cp_default_config_file(NameVsn) end. cp_default_config_file(NameVsn) -> - %% always copy default avro file into config dir - %% when can not get config from other nodes + %% always copy default hocon file into config dir when can not get config from other nodes Source = default_plugin_config_file(NameVsn), Destination = plugin_config_file(NameVsn), maybe @@ -1194,40 +1189,39 @@ cp_default_config_file(NameVsn) -> ok; {error, Reason} -> ?SLOG(warning, #{ - msg => "failed_to_copy_plugin_default_avro_config", + msg => "failed_to_copy_plugin_default_hocon_config", source => Source, destination => Destination, reason => Reason }) end else - _ -> ensure_avro_config(NameVsn) + _ -> ensure_config_map(NameVsn) end. -ensure_avro_config(NameVsn) -> +ensure_config_map(NameVsn) -> with_plugin_avsc(NameVsn) andalso - do_ensure_avro_config(NameVsn). + do_ensure_config_map(NameVsn). -do_ensure_avro_config(NameVsn) -> - case read_plugin_avro(NameVsn, #{read_mode => ?JSON_MAP}) of - {ok, AvroJsonMap} -> - {ok, Config} = decode_plugin_avro_config( - NameVsn, emqx_utils_json:encode(AvroJsonMap) - ), - put_config(NameVsn, AvroJsonMap, Config); +do_ensure_config_map(NameVsn) -> + case read_plugin_hocon(NameVsn, #{read_mode => ?JSON_MAP}) of + {ok, ConfigJsonMap} -> + {ok, Config} = decode_plugin_config_map(NameVsn, ConfigJsonMap), + put_config(NameVsn, ConfigJsonMap, Config); _ -> + ?SLOG(warning, #{msg => "failed_to_read_plugin_config_hocon", name_vsn => NameVsn}), ok end. %% @private Backup the current config to a file with a timestamp suffix and %% then save the new config to the config file. -backup_and_write_avro_bin(NameVsn, AvroBin) -> +backup_and_write_hocon_bin(NameVsn, HoconBin) -> %% this may fail, but we don't care %% e.g. read-only file system Path = plugin_config_file(NameVsn), _ = filelib:ensure_dir(Path), TmpFile = Path ++ ".tmp", - case file:write_file(TmpFile, AvroBin) of + case file:write_file(TmpFile, HoconBin) of ok -> backup_and_replace(Path, TmpFile); {error, Reason} -> @@ -1313,7 +1307,7 @@ plugin_dir(NameVsn) -> plugin_priv_dir(NameVsn) -> case read_plugin_info(NameVsn, #{fill_readme => false}) of {ok, #{<<"name">> := Name, <<"metadata_vsn">> := Vsn}} -> - AppDir = <>, + AppDir = make_name_vsn_string(Name, Vsn), wrap_list_path(filename:join([plugin_dir(NameVsn), AppDir, "priv"])); _ -> wrap_list_path(filename:join([install_dir(), NameVsn, "priv"])) diff --git a/apps/emqx_plugins/src/proto/emqx_plugins_proto_v2.erl b/apps/emqx_plugins/src/proto/emqx_plugins_proto_v2.erl index 0e933d351..fa91a8512 100644 --- a/apps/emqx_plugins/src/proto/emqx_plugins_proto_v2.erl +++ b/apps/emqx_plugins/src/proto/emqx_plugins_proto_v2.erl @@ -27,8 +27,6 @@ -include("emqx_plugins.hrl"). -include_lib("emqx/include/bpapi.hrl"). --type name_vsn() :: binary() | string(). - introduced_in() -> "5.7.0". From f3e8037e0f830d2c42cbf0ca74ec97e8e86e5045 Mon Sep 17 00:00:00 2001 From: firest Date: Wed, 22 May 2024 16:29:38 +0800 Subject: [PATCH 16/55] fix(rocketmq): fix namespace error for RocketMQ --- .../src/emqx_bridge_rocketmq_connector.erl | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/apps/emqx_bridge_rocketmq/src/emqx_bridge_rocketmq_connector.erl b/apps/emqx_bridge_rocketmq/src/emqx_bridge_rocketmq_connector.erl index 4aeb6e772..5c62fd622 100644 --- a/apps/emqx_bridge_rocketmq/src/emqx_bridge_rocketmq_connector.erl +++ b/apps/emqx_bridge_rocketmq/src/emqx_bridge_rocketmq_connector.erl @@ -112,11 +112,13 @@ on_start( ), ClientId = client_id(InstanceId), ACLInfo = acl_info(AccessKey, SecretKey, SecurityToken), - ClientCfg = namespace(#{acl_info => ACLInfo}, Config), + Namespace = maps:get(namespace, Config, <<>>), + ClientCfg = #{acl_info => ACLInfo, namespace => Namespace}, State = #{ client_id => ClientId, acl_info => ACLInfo, + namespace => Namespace, installed_channels => #{} }, @@ -139,12 +141,13 @@ on_add_channel( _InstId, #{ installed_channels := InstalledChannels, + namespace := Namespace, acl_info := ACLInfo } = OldState, ChannelId, ChannelConfig ) -> - {ok, ChannelState} = create_channel_state(ChannelConfig, ACLInfo), + {ok, ChannelState} = create_channel_state(ChannelConfig, ACLInfo, Namespace), NewInstalledChannels = maps:put(ChannelId, ChannelState, InstalledChannels), %% Update state NewState = OldState#{installed_channels => NewInstalledChannels}, @@ -152,16 +155,18 @@ on_add_channel( create_channel_state( #{parameters := Conf} = _ChannelConfig, - ACLInfo + ACLInfo, + Namespace ) -> #{ topic := Topic, - sync_timeout := SyncTimeout + sync_timeout := SyncTimeout, + strategy := Strategy } = Conf, TopicTks = emqx_placeholder:preproc_tmpl(Topic), - ProducerOpts = make_producer_opts(Conf, ACLInfo), + ProducerOpts = make_producer_opts(Conf, ACLInfo, Namespace, Strategy), Templates = parse_template(Conf), - DispatchStrategy = parse_dispatch_strategy(Conf), + DispatchStrategy = parse_dispatch_strategy(Strategy), State = #{ topic => Topic, topic_tokens => TopicTks, @@ -330,11 +335,11 @@ parse_template([], Templates) -> Templates. %% returns a procedure to generate the produce context -parse_dispatch_strategy(#{strategy := roundrobin}) -> +parse_dispatch_strategy(roundrobin) -> fun(_) -> #{} end; -parse_dispatch_strategy(#{strategy := Template}) -> +parse_dispatch_strategy(Template) -> Tokens = emqx_placeholder:preproc_tmpl(Template), fun(Msg) -> #{ @@ -400,12 +405,20 @@ make_producer_opts( send_buffer := SendBuff, refresh_interval := RefreshInterval }, - ACLInfo + ACLInfo, + Namespace, + Strategy ) -> #{ tcp_opts => [{sndbuf, SendBuff}], ref_topic_route_interval => RefreshInterval, - acl_info => emqx_secret:wrap(ACLInfo) + acl_info => emqx_secret:wrap(ACLInfo), + namespace => Namespace, + partitioner => + case Strategy of + roundrobin -> roundrobin; + _ -> key_dispatch + end }. acl_info(<<>>, _, _) -> @@ -424,10 +437,6 @@ acl_info(AccessKey, SecretKey, SecurityToken) when is_binary(AccessKey) -> acl_info(_, _, _) -> #{}. -namespace(ClientCfg, Config) -> - Namespace = maps:get(namespace, Config, <<>>), - ClientCfg#{namespace => Namespace}. - create_producers_map(ClientId) -> _ = ets:new(ClientId, [public, named_table, {read_concurrency, true}]), ok. From 917474f69438f58973337c8c3ffce92f2691e20f Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Mon, 20 May 2024 16:03:14 +0200 Subject: [PATCH 17/55] fix: action config update would sometimes not be reflected in connector Before this commit the following happened sometimes: 1. action status is connected 2. action config is updated to something that should change the status to disconnected 3. action status is still connected and the old config is being used by the connector even though the config has been correctly updated. The reason for this bug is that the post_config_hook runs before the global EMQX config is updated. The post config hook is adding the new config to the connector. Since the new config causes the action to get status disconnected, the adding of the action to the connector is retried when the health check runs but this time the config will be loaded from the global config which could cause the old config to be loaded (this happens when the global config has not had time to get updated). The above problem has been fixed in this commit by only reading action configs from the global config when the connector starts/restarts and instead store the latest configs for the actions in the connector. Fixes: https://emqx.atlassian.net/browse/EMQX-12376 --- .../emqx_bridge/test/emqx_bridge_v2_SUITE.erl | 54 ++++++ .../src/emqx_resource_manager.erl | 159 ++++++++++-------- 2 files changed, 147 insertions(+), 66 deletions(-) diff --git a/apps/emqx_bridge/test/emqx_bridge_v2_SUITE.erl b/apps/emqx_bridge/test/emqx_bridge_v2_SUITE.erl index d6199f3a3..dc2e8f275 100644 --- a/apps/emqx_bridge/test/emqx_bridge_v2_SUITE.erl +++ b/apps/emqx_bridge/test/emqx_bridge_v2_SUITE.erl @@ -788,6 +788,60 @@ t_update_connector_not_found(_Config) -> ), ok. +%% Check that https://emqx.atlassian.net/browse/EMQX-12376 is fixed +t_update_concurrent_health_check(_Config) -> + Msg = <<"Channel status check failed">>, + ok = meck:expect( + emqx_bridge_v2_test_connector, + on_get_channel_status, + fun( + _ResId, + ChannelId, + #{channels := Channels} + ) -> + #{ + is_conf_for_connected := Connected + } = maps:get(ChannelId, Channels), + case Connected of + true -> + connected; + false -> + {error, Msg} + end + end + ), + BaseConf = (bridge_config())#{ + is_conf_for_connected => false + }, + ?assertMatch({ok, _}, emqx_bridge_v2:create(bridge_type(), my_test_bridge, BaseConf)), + SetStatusConnected = + fun + (true) -> + Conf = BaseConf#{is_conf_for_connected => true}, + %% Update the config + ?assertMatch({ok, _}, emqx_bridge_v2:create(bridge_type(), my_test_bridge, Conf)), + ?assertMatch( + #{status := connected}, + emqx_bridge_v2:health_check(bridge_type(), my_test_bridge) + ); + (false) -> + Conf = BaseConf#{is_conf_for_connected => false}, + %% Update the config + ?assertMatch({ok, _}, emqx_bridge_v2:create(bridge_type(), my_test_bridge, Conf)), + ?assertMatch( + #{status := disconnected}, + emqx_bridge_v2:health_check(bridge_type(), my_test_bridge) + ) + end, + [ + begin + Connected = (N rem 2) =:= 0, + SetStatusConnected(Connected) + end + || N <- lists:seq(0, 20) + ], + ok. + t_remove_single_connector_being_referenced_with_active_channels(_Config) -> %% we test the connector post config update here because we also need bridges. Conf = bridge_config(), diff --git a/apps/emqx_resource/src/emqx_resource_manager.erl b/apps/emqx_resource/src/emqx_resource_manager.erl index 1c0c3f4ca..1efd88b24 100644 --- a/apps/emqx_resource/src/emqx_resource_manager.erl +++ b/apps/emqx_resource/src/emqx_resource_manager.erl @@ -542,9 +542,9 @@ handle_event(enter, _OldState, ?state_stopped = State, Data) -> {keep_state_and_data, []}; %% The following events can be handled in any other state handle_event( - {call, From}, {add_channel, ChannelId, _Config}, State, Data + {call, From}, {add_channel, ChannelId, Config}, State, Data ) -> - handle_not_connected_add_channel(From, ChannelId, State, Data); + handle_not_connected_add_channel(From, ChannelId, Config, State, Data); handle_event( {call, From}, {remove_channel, ChannelId}, _State, Data ) -> @@ -678,8 +678,8 @@ add_channels(Data) -> Channels = Data#data.added_channels, NewChannels = lists:foldl( fun - ({ChannelID, #{enable := true}}, Acc) -> - maps:put(ChannelID, channel_status(), Acc); + ({ChannelID, #{enable := true} = Config}, Acc) -> + maps:put(ChannelID, channel_status_not_added(Config), Acc); ({_, #{enable := false}}, Acc) -> Acc end, @@ -702,7 +702,7 @@ add_channels_in_list([{ChannelID, ChannelConfig} | Rest], Data) -> %% we have not yet performed the initial health_check NewAddedChannelsMap = maps:put( ChannelID, - channel_status_new_waiting_for_health_check(), + channel_status_new_waiting_for_health_check(ChannelConfig), AddedChannelsMap ), NewData = Data#data{ @@ -720,7 +720,7 @@ add_channels_in_list([{ChannelID, ChannelConfig} | Rest], Data) -> AddedChannelsMap = Data#data.added_channels, NewAddedChannelsMap = maps:put( ChannelID, - channel_status(Error), + channel_status(Error, ChannelConfig), AddedChannelsMap ), NewData = Data#data{ @@ -835,7 +835,7 @@ handle_add_channel(From, Data, ChannelId, Config) -> maps:get( ChannelId, Channels, - channel_status() + channel_status_not_added(Config) ) ) of @@ -843,7 +843,7 @@ handle_add_channel(From, Data, ChannelId, Config) -> %% The channel is not installed in the connector state %% We insert it into the channels map and let the health check %% take care of the rest - NewChannels = maps:put(ChannelId, channel_status_new_with_config(Config), Channels), + NewChannels = maps:put(ChannelId, channel_status_not_added(Config), Channels), NewData = Data#data{added_channels = NewChannels}, {keep_state, update_state(NewData, Data), [ {reply, From, ok} @@ -854,17 +854,21 @@ handle_add_channel(From, Data, ChannelId, Config) -> {keep_state_and_data, [{reply, From, ok}]} end. -handle_not_connected_add_channel(From, ChannelId, State, Data) -> +handle_not_connected_add_channel(From, ChannelId, ChannelConfig, State, Data) -> %% When state is not connected the channel will be added to the channels %% map but nothing else will happen. - NewData = add_channel_status_if_not_exists(Data, ChannelId, State), + NewData = add_or_update_channel_status(Data, ChannelId, ChannelConfig, State), {keep_state, update_state(NewData, Data), [{reply, From, ok}]}. handle_remove_channel(From, ChannelId, Data) -> Channels = Data#data.added_channels, %% Deactivate alarm _ = maybe_clear_alarm(ChannelId), - case channel_status_is_channel_added(maps:get(ChannelId, Channels, channel_status())) of + case + channel_status_is_channel_added( + maps:get(ChannelId, Channels, channel_status_not_added(undefined)) + ) + of false -> %% The channel is already not installed in the connector state. %% We still need to remove it from the added_channels map @@ -1033,7 +1037,9 @@ continue_resource_health_check_not_connected(NewStatus, Data0) -> end. handle_manual_channel_health_check(From, #data{state = undefined}, _ChannelId) -> - {keep_state_and_data, [{reply, From, channel_status({error, resource_disconnected})}]}; + {keep_state_and_data, [ + {reply, From, channel_status({error, resource_disconnected}, undefined)} + ]}; handle_manual_channel_health_check( From, #data{ @@ -1072,7 +1078,7 @@ handle_manual_channel_health_check( _Data, _ChannelId ) -> - {keep_state_and_data, [{reply, From, channel_status({error, channel_not_found})}]}. + {keep_state_and_data, [{reply, From, channel_status({error, channel_not_found}, undefined)}]}. -spec channels_health_check(resource_status(), data()) -> data(). channels_health_check(?status_connected = _ConnectorStatus, Data0) -> @@ -1097,14 +1103,14 @@ channels_health_check(?status_connecting = _ConnectorStatus, Data0) -> %% 2. Raise alarms (TODO: if it is a probe we should not raise alarms) Channels = Data0#data.added_channels, ChannelsToChangeStatusFor = [ - ChannelId - || {ChannelId, Status} <- maps:to_list(Channels), + {ChannelId, Config} + || {ChannelId, #{config := Config} = Status} <- maps:to_list(Channels), channel_status_is_channel_added(Status) ], ChannelsWithNewStatuses = [ - {ChannelId, channel_status({?status_connecting, resource_is_connecting})} - || ChannelId <- ChannelsToChangeStatusFor + {ChannelId, channel_status({?status_connecting, resource_is_connecting}, Config)} + || {ChannelId, Config} <- ChannelsToChangeStatusFor ], %% Update the channels map NewChannels = lists:foldl( @@ -1149,9 +1155,10 @@ channels_health_check(ConnectorStatus, Data0) -> ConnectorStatus, ChannelId, Data1 - )} + )}, + Config )} - || {ChannelId, OldStatus} <- maps:to_list(Data1#data.added_channels) + || {ChannelId, #{config := Config} = OldStatus} <- maps:to_list(Data1#data.added_channels) ], %% Raise alarms _ = lists:foreach( @@ -1224,7 +1231,7 @@ continue_channel_health_check_connected(ChannelId, OldStatus, Data0) -> #{channel := CHCWorkers0} = HCWorkers0, CHCWorkers = emqx_utils_maps:deep_remove([ongoing, ChannelId], CHCWorkers0), Data1 = Data0#data{hc_workers = HCWorkers0#{channel := CHCWorkers}}, - %% Remove the added channels with a a status different from connected or connecting + %% Remove the added channels with a status different from connected or connecting NewStatus = maps:get(ChannelId, Data0#data.added_channels), ChannelsToRemove = [ChannelId || not channel_status_is_channel_added(NewStatus)], Data = remove_channels_in_list(ChannelsToRemove, Data1, true), @@ -1253,9 +1260,11 @@ spawn_channel_health_check_worker(#data{} = Data, ChannelId) -> %% separated so it can be spec'ed and placate dialyzer tantrums... -spec worker_channel_health_check(data(), channel_id()) -> no_return(). worker_channel_health_check(Data, ChannelId) -> - #data{id = ResId, mod = Mod, state = State} = Data, + #data{id = ResId, mod = Mod, state = State, added_channels = Channels} = Data, + ChannelStatus = maps:get(ChannelId, Channels, #{}), + ChannelConfig = maps:get(config, ChannelStatus, undefined), RawStatus = emqx_resource:call_channel_health_check(ResId, ChannelId, Mod, State), - exit({ok, channel_status(RawStatus)}). + exit({ok, channel_status(RawStatus, ChannelConfig)}). -spec handle_channel_health_check_worker_down( data(), {pid(), reference()}, {ok, channel_status_map()} @@ -1267,11 +1276,15 @@ handle_channel_health_check_worker_down(Data0, WorkerRef, ExitResult) -> added_channels = AddedChannels0 } = Data0, {ChannelId, CHCWorkers1} = maps:take(WorkerRef, CHCWorkers0), - case ExitResult of - {ok, NewStatus} -> - %% `emqx_resource:call_channel_health_check' catches all exceptions. - AddedChannels = maps:put(ChannelId, NewStatus, AddedChannels0) - end, + %% The channel might have got removed while the health check was going on + CurrentStatus = maps:get(ChannelId, AddedChannels0, channel_not_added), + {AddedChannels, NewStatus} = + handle_channel_health_check_worker_down_new_channels_and_status( + ChannelId, + ExitResult, + CurrentStatus, + AddedChannels0 + ), #{ongoing := Ongoing0} = CHCWorkers1, {PreviousChanStatus, Ongoing1} = maps:take(ChannelId, Ongoing0), CHCWorkers2 = CHCWorkers1#{ongoing := Ongoing1}, @@ -1293,6 +1306,26 @@ handle_channel_health_check_worker_down(Data0, WorkerRef, ExitResult) -> {keep_state, update_state(Data, Data0), Replies} end. +handle_channel_health_check_worker_down_new_channels_and_status( + ChannelId, + {ok, #{config := CheckedConfig} = NewStatus} = _ExitResult, + #{config := CurrentConfig} = _CurrentStatus, + AddedChannels +) when CheckedConfig =:= CurrentConfig -> + %% Checked config is the same as the current config so we can update the + %% status in AddedChannels + {maps:put(ChannelId, NewStatus, AddedChannels), NewStatus}; +handle_channel_health_check_worker_down_new_channels_and_status( + _ChannelId, + {ok, NewStatus} = _ExitResult, + _CurrentStatus, + AddedChannels +) -> + %% The checked config is different from the current config which means we + %% should not update AddedChannels because the channel has been removed or + %% updated while the health check was in progress + {AddedChannels, NewStatus}. + reply_pending_channel_health_check_callers( ChannelId, Status, Data0 = #data{hc_pending_callers = Pending0} ) -> @@ -1469,7 +1502,7 @@ safe_call(ResId, Message, Timeout) -> %% Helper functions for chanel status data -channel_status() -> +channel_status_not_added(ChannelConfig) -> #{ %% The status of the channel. Can be one of the following: %% - disconnected: the channel is not added to the resource (error may contain the reason)) @@ -1479,62 +1512,61 @@ channel_status() -> %% - connected: the channel is added to the resource, the resource is %% connected and the on_channel_get_status callback has returned %% connected. The error field should be undefined. - status => ?status_disconnected, - error => not_added_yet - }. - -%% If the channel is added with add_channel/2, the config field will be set to -%% the config. This is useful when doing probing since the config is not stored -%% anywhere else in that case. -channel_status_new_with_config(Config) -> - #{ status => ?status_disconnected, error => not_added_yet, - config => Config + config => ChannelConfig }. -channel_status_new_waiting_for_health_check() -> +channel_status_new_waiting_for_health_check(ChannelConfig) -> #{ status => ?status_connecting, - error => no_health_check_yet + error => no_health_check_yet, + config => ChannelConfig }. -channel_status({?status_connecting, Error}) -> +channel_status({?status_connecting, Error}, ChannelConfig) -> #{ status => ?status_connecting, - error => Error + error => Error, + config => ChannelConfig }; -channel_status({?status_disconnected, Error}) -> +channel_status({?status_disconnected, Error}, ChannelConfig) -> #{ status => ?status_disconnected, - error => Error + error => Error, + config => ChannelConfig }; -channel_status(?status_disconnected) -> +channel_status(?status_disconnected, ChannelConfig) -> #{ status => ?status_disconnected, - error => <<"Disconnected for unknown reason">> + error => <<"Disconnected for unknown reason">>, + config => ChannelConfig }; -channel_status(?status_connecting) -> +channel_status(?status_connecting, ChannelConfig) -> #{ status => ?status_connecting, - error => <<"Not connected for unknown reason">> + error => <<"Not connected for unknown reason">>, + config => ChannelConfig }; -channel_status(?status_connected) -> +channel_status(?status_connected, ChannelConfig) -> #{ status => ?status_connected, - error => undefined + error => undefined, + config => ChannelConfig }; %% Probably not so useful but it is permitted to set an error even when the %% status is connected -channel_status({?status_connected, Error}) -> +channel_status({?status_connected, Error}, ChannelConfig) -> #{ status => ?status_connected, - error => Error + error => Error, + config => ChannelConfig }; -channel_status({error, Reason}) -> +channel_status({error, Reason}, ChannelConfig) -> #{ status => ?status_disconnected, - error => Reason + error => Reason, + config => ChannelConfig }. channel_status_is_channel_added(#{ @@ -1548,19 +1580,14 @@ channel_status_is_channel_added(#{ channel_status_is_channel_added(_Status) -> false. --spec add_channel_status_if_not_exists(data(), channel_id(), resource_state()) -> data(). -add_channel_status_if_not_exists(Data, ChannelId, State) -> +-spec add_or_update_channel_status(data(), channel_id(), map(), resource_state()) -> data(). +add_or_update_channel_status(Data, ChannelId, ChannelConfig, State) -> Channels = Data#data.added_channels, - case maps:is_key(ChannelId, Channels) of - true -> - Data; - false -> - ChannelStatus = channel_status({error, resource_not_operational}), - NewChannels = maps:put(ChannelId, ChannelStatus, Channels), - ResStatus = state_to_status(State), - maybe_alarm(ResStatus, ChannelId, ChannelStatus, no_prev), - Data#data{added_channels = NewChannels} - end. + ChannelStatus = channel_status({error, resource_not_operational}, ChannelConfig), + NewChannels = maps:put(ChannelId, ChannelStatus, Channels), + ResStatus = state_to_status(State), + maybe_alarm(ResStatus, ChannelId, ChannelStatus, no_prev), + Data#data{added_channels = NewChannels}. state_to_status(?state_stopped) -> ?rm_status_stopped; state_to_status(?state_connected) -> ?status_connected; From 331f9a1b96db9a4534853fa8f60007f4faf19e24 Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Mon, 20 May 2024 16:26:20 +0200 Subject: [PATCH 18/55] docs: add change log entry --- changes/ce/fix-13077.en.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/ce/fix-13077.en.md diff --git a/changes/ce/fix-13077.en.md b/changes/ce/fix-13077.en.md new file mode 100644 index 000000000..8082c6c40 --- /dev/null +++ b/changes/ce/fix-13077.en.md @@ -0,0 +1 @@ +Updates to action configurations would sometimes not take effect without disabling and enabling the action. This means that an action could sometimes run with the old (previous) configuration even though it would look like the action configuration has been updated successfully. From 39d758c4d6aacb6bb4a67c4f7f73211bdeeaf29a Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Tue, 21 May 2024 07:15:21 +0200 Subject: [PATCH 19/55] fix: do not return configs for channels from emqx_resource_manager --- .../src/emqx_resource_manager.erl | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/apps/emqx_resource/src/emqx_resource_manager.erl b/apps/emqx_resource/src/emqx_resource_manager.erl index 1efd88b24..8bf673b91 100644 --- a/apps/emqx_resource/src/emqx_resource_manager.erl +++ b/apps/emqx_resource/src/emqx_resource_manager.erl @@ -1038,7 +1038,8 @@ continue_resource_health_check_not_connected(NewStatus, Data0) -> handle_manual_channel_health_check(From, #data{state = undefined}, _ChannelId) -> {keep_state_and_data, [ - {reply, From, channel_status({error, resource_disconnected}, undefined)} + {reply, From, + maps:remove(config, channel_status({error, resource_disconnected}, undefined))} ]}; handle_manual_channel_health_check( From, @@ -1072,13 +1073,15 @@ handle_manual_channel_health_check( is_map_key(ChannelId, Channels) -> %% No ongoing health check: reply with current status. - {keep_state_and_data, [{reply, From, maps:get(ChannelId, Channels)}]}; + {keep_state_and_data, [{reply, From, maps:remove(config, maps:get(ChannelId, Channels))}]}; handle_manual_channel_health_check( From, _Data, _ChannelId ) -> - {keep_state_and_data, [{reply, From, channel_status({error, channel_not_found}, undefined)}]}. + {keep_state_and_data, [ + {reply, From, maps:remove(config, channel_status({error, channel_not_found}, undefined))} + ]}. -spec channels_health_check(resource_status(), data()) -> data(). channels_health_check(?status_connected = _ConnectorStatus, Data0) -> @@ -1327,8 +1330,9 @@ handle_channel_health_check_worker_down_new_channels_and_status( {AddedChannels, NewStatus}. reply_pending_channel_health_check_callers( - ChannelId, Status, Data0 = #data{hc_pending_callers = Pending0} + ChannelId, Status0, Data0 = #data{hc_pending_callers = Pending0} ) -> + Status = maps:remove(config, Status0), #{channel := CPending0} = Pending0, Pending = maps:get(ChannelId, CPending0, []), Actions = [{reply, From, Status} || From <- Pending], @@ -1459,6 +1463,14 @@ maybe_reply(Actions, From, Reply) -> -spec data_record_to_external_map(data()) -> resource_data(). data_record_to_external_map(Data) -> + AddedChannelsList = maps:to_list(Data#data.added_channels), + AddedChannelsListWithoutConfigs = + [ + {ChanID, maps:remove(config, Status)} + || {ChanID, Status} <- AddedChannelsList + ], + AddedChannelsWithoutConfigs = + maps:from_list(AddedChannelsListWithoutConfigs), #{ id => Data#data.id, error => external_error(Data#data.error), @@ -1468,7 +1480,7 @@ data_record_to_external_map(Data) -> config => Data#data.config, status => Data#data.status, state => Data#data.state, - added_channels => Data#data.added_channels + added_channels => AddedChannelsWithoutConfigs }. -spec wait_for_ready(resource_id(), integer()) -> ok | timeout | {error, term()}. From cff8b97e8a56200deaddeed6ef0ff81a7a5c2857 Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Wed, 22 May 2024 10:39:19 +0200 Subject: [PATCH 20/55] fix: handle channel updated during health check This commit fixes an issue found by CI test case emqx_bridge_influxdb_SUITE:t_start_stop and others. While the channel health check process is running, the channel could be removed or updated which could cause a crash in the resource manager or non up-to-date alarms being triggered. --- .../src/emqx_resource_manager.erl | 39 ++++++++++++++++--- 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/apps/emqx_resource/src/emqx_resource_manager.erl b/apps/emqx_resource/src/emqx_resource_manager.erl index 8bf673b91..70dac17ae 100644 --- a/apps/emqx_resource/src/emqx_resource_manager.erl +++ b/apps/emqx_resource/src/emqx_resource_manager.erl @@ -1228,14 +1228,29 @@ trigger_health_check_for_added_channels(Data0 = #data{hc_workers = HCWorkers0}) start_channel_health_check(Data1, ChannelId) end. --spec continue_channel_health_check_connected(channel_id(), channel_status_map(), data()) -> data(). -continue_channel_health_check_connected(ChannelId, OldStatus, Data0) -> +-spec continue_channel_health_check_connected( + channel_id(), channel_status_map(), channel_status_map(), data() +) -> data(). +continue_channel_health_check_connected(ChannelId, OldStatus, CurrentStatus, Data0) -> #data{hc_workers = HCWorkers0} = Data0, #{channel := CHCWorkers0} = HCWorkers0, CHCWorkers = emqx_utils_maps:deep_remove([ongoing, ChannelId], CHCWorkers0), Data1 = Data0#data{hc_workers = HCWorkers0#{channel := CHCWorkers}}, + case OldStatus =:= CurrentStatus of + true -> + continue_channel_health_check_connected_no_update_during_check( + ChannelId, OldStatus, Data1 + ); + false -> + %% Channel has been updated while the health check process was working so + %% we should not clear any alarm or remove the channel from the + %% connector + Data1 + end. + +continue_channel_health_check_connected_no_update_during_check(ChannelId, OldStatus, Data1) -> %% Remove the added channels with a status different from connected or connecting - NewStatus = maps:get(ChannelId, Data0#data.added_channels), + NewStatus = maps:get(ChannelId, Data1#data.added_channels), ChannelsToRemove = [ChannelId || not channel_status_is_channel_added(NewStatus)], Data = remove_channels_in_list(ChannelsToRemove, Data1, true), %% Raise/clear alarms @@ -1299,13 +1314,23 @@ handle_channel_health_check_worker_down(Data0, WorkerRef, ExitResult) -> CHCWorkers = CHCWorkers3#{pending := Rest}, HCWorkers = HCWorkers0#{channel := CHCWorkers}, Data3 = Data2#data{hc_workers = HCWorkers}, - Data4 = continue_channel_health_check_connected(ChannelId, PreviousChanStatus, Data3), + Data4 = continue_channel_health_check_connected( + ChannelId, + PreviousChanStatus, + CurrentStatus, + Data3 + ), Data = start_channel_health_check(Data4, NextChannelId), {keep_state, update_state(Data, Data0), Replies}; #{pending := []} -> HCWorkers = HCWorkers0#{channel := CHCWorkers3}, Data3 = Data2#data{hc_workers = HCWorkers}, - Data = continue_channel_health_check_connected(ChannelId, PreviousChanStatus, Data3), + Data = continue_channel_health_check_connected( + ChannelId, + PreviousChanStatus, + CurrentStatus, + Data3 + ), {keep_state, update_state(Data, Data0), Replies} end. @@ -1326,7 +1351,9 @@ handle_channel_health_check_worker_down_new_channels_and_status( ) -> %% The checked config is different from the current config which means we %% should not update AddedChannels because the channel has been removed or - %% updated while the health check was in progress + %% updated while the health check was in progress. We can still reply with + %% NewStatus because the health check must have been issued before the + %% configuration changed or the channel got removed. {AddedChannels, NewStatus}. reply_pending_channel_health_check_callers( From c355c9ad500b63de5dcc2774436d4f1afb7ef6a6 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Wed, 22 May 2024 17:22:55 +0200 Subject: [PATCH 21/55] fix(dsrepl): properly handle transaction abort during forget site --- .../src/emqx_ds_replication_layer_meta.erl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_meta.erl b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_meta.erl index fa53ecced..387dddbcf 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_meta.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_meta.erl @@ -694,12 +694,12 @@ ensure_site() -> forget_node(Node) -> Sites = node_sites(Node), - Results = transaction(fun lists:map/2, [fun ?MODULE:forget_site_trans/1, Sites]), - case [Reason || {error, Reason} <- Results] of - [] -> + Result = transaction(fun lists:map/2, [fun ?MODULE:forget_site_trans/1, Sites]), + case Result of + Ok when is_list(Ok) -> ok; - Errors -> - logger:error("Failed to forget leaving node ~p: ~p", [Node, Errors]) + {error, Reason} -> + logger:error("Failed to forget leaving node ~p: ~p", [Node, Reason]) end. %% @doc Returns sorted list of sites shards are replicated across. From e6c5c1b598c3df95f5c7a256ec7e3b864ed0aa00 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Wed, 22 May 2024 17:24:08 +0200 Subject: [PATCH 22/55] chore(dsrepl): provide more information in rebalancing log messages --- .../emqx_ds_replication_shard_allocator.erl | 36 +++++++++++++++---- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/apps/emqx_durable_storage/src/emqx_ds_replication_shard_allocator.erl b/apps/emqx_durable_storage/src/emqx_ds_replication_shard_allocator.erl index 6d8db94e3..d198b2ddd 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_replication_shard_allocator.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_replication_shard_allocator.erl @@ -229,6 +229,7 @@ handle_transition(DB, Shard, Trans, Handler) -> domain => [emqx, ds, DB, shard_transition] }), ?tp( + debug, dsrepl_shard_transition_begin, #{shard => Shard, db => DB, transition => Trans, pid => self()} ), @@ -240,7 +241,12 @@ apply_handler(Fun, DB, Shard, Trans) -> erlang:apply(Fun, [DB, Shard, Trans]). trans_add_local(DB, Shard, {add, Site}) -> - logger:info(#{msg => "Adding new local shard replica", site => Site}), + logger:info(#{ + msg => "Adding new local shard replica", + site => Site, + db => DB, + shard => Shard + }), do_add_local(membership, DB, Shard). do_add_local(membership = Stage, DB, Shard) -> @@ -251,6 +257,8 @@ do_add_local(membership = Stage, DB, Shard) -> {error, recoverable, Reason} -> logger:warning(#{ msg => "Shard membership change failed", + db => DB, + shard => Shard, reason => Reason, retry_in => ?TRANS_RETRY_TIMEOUT }), @@ -261,10 +269,12 @@ do_add_local(readiness = Stage, DB, Shard) -> LocalServer = emqx_ds_replication_layer_shard:local_server(DB, Shard), case emqx_ds_replication_layer_shard:server_info(readiness, LocalServer) of ready -> - logger:info(#{msg => "Local shard replica ready"}); + logger:info(#{msg => "Local shard replica ready", db => DB, shard => Shard}); Status -> logger:warning(#{ msg => "Still waiting for local shard replica to be ready", + db => DB, + shard => Shard, status => Status, retry_in => ?TRANS_RETRY_TIMEOUT }), @@ -273,7 +283,12 @@ do_add_local(readiness = Stage, DB, Shard) -> end. trans_drop_local(DB, Shard, {del, Site}) -> - logger:info(#{msg => "Dropping local shard replica", site => Site}), + logger:info(#{ + msg => "Dropping local shard replica", + site => Site, + db => DB, + shard => Shard + }), do_drop_local(DB, Shard). do_drop_local(DB, Shard) -> @@ -293,17 +308,24 @@ do_drop_local(DB, Shard) -> end. trans_rm_unresponsive(DB, Shard, {del, Site}) -> - logger:info(#{msg => "Removing unresponsive shard replica", site => Site}), + logger:info(#{ + msg => "Removing unresponsive shard replica", + site => Site, + db => DB, + shard => Shard + }), do_rm_unresponsive(DB, Shard, Site). do_rm_unresponsive(DB, Shard, Site) -> Server = emqx_ds_replication_layer_shard:shard_server(DB, Shard, Site), case emqx_ds_replication_layer_shard:remove_server(DB, Shard, Server) of ok -> - logger:info(#{msg => "Unresponsive shard replica removed"}); + logger:info(#{msg => "Unresponsive shard replica removed", db => DB, shard => Shard}); {error, recoverable, Reason} -> logger:warning(#{ msg => "Shard membership change failed", + db => DB, + shard => Shard, reason => Reason, retry_in => ?TRANS_RETRY_TIMEOUT }), @@ -341,6 +363,7 @@ handle_exit(Pid, Reason, State0 = #{db := DB, transitions := Ts}) -> case maps:to_list(maps:filter(fun(_, TH) -> TH#transhdl.pid == Pid end, Ts)) of [{Track, #transhdl{shard = Shard, trans = Trans}}] -> ?tp( + debug, dsrepl_shard_transition_end, #{shard => Shard, db => DB, transition => Trans, pid => Pid, reason => Reason} ), @@ -361,9 +384,10 @@ handle_transition_exit(Shard, Trans, normal, State = #{db := DB}) -> State; handle_transition_exit(_Shard, _Trans, {shutdown, skipped}, State) -> State; -handle_transition_exit(Shard, Trans, Reason, State) -> +handle_transition_exit(Shard, Trans, Reason, State = #{db := DB}) -> logger:warning(#{ msg => "Shard membership transition failed", + db => DB, shard => Shard, transition => Trans, reason => Reason, From 4b540e3bd005a0008729f93e0561ee6d572e6e56 Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Wed, 22 May 2024 17:43:07 +0200 Subject: [PATCH 23/55] fix: do not leak action configurations in alarm messages --- apps/emqx_resource/src/emqx_resource_manager.erl | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/apps/emqx_resource/src/emqx_resource_manager.erl b/apps/emqx_resource/src/emqx_resource_manager.erl index 70dac17ae..7b9e13b16 100644 --- a/apps/emqx_resource/src/emqx_resource_manager.erl +++ b/apps/emqx_resource/src/emqx_resource_manager.erl @@ -1431,9 +1431,13 @@ maybe_alarm(_Status, _ResId, Error, Error) -> maybe_alarm(_Status, ResId, Error, _PrevError) -> HrError = case Error of - {error, undefined} -> <<"Unknown reason">>; - {error, Reason} -> emqx_utils:readable_error_msg(Reason); - _ -> emqx_utils:readable_error_msg(Error) + {error, undefined} -> + <<"Unknown reason">>; + {error, Reason} -> + emqx_utils:readable_error_msg(Reason); + _ -> + Error1 = redact_config_from_error_status(Error), + emqx_utils:readable_error_msg(Error1) end, emqx_alarm:safe_activate( ResId, @@ -1442,6 +1446,11 @@ maybe_alarm(_Status, ResId, Error, _PrevError) -> ), ?tp(resource_activate_alarm, #{resource_id => ResId}). +redact_config_from_error_status(#{config := _} = ErrorStatus) -> + maps:remove(config, ErrorStatus); +redact_config_from_error_status(Error) -> + Error. + -spec maybe_resume_resource_workers(resource_id(), resource_status()) -> ok. maybe_resume_resource_workers(ResId, ?status_connected) -> lists:foreach( From 8016e9adf43de5027c3704eb4a0c5ed2002ba76c Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Wed, 22 May 2024 13:00:40 +0200 Subject: [PATCH 24/55] fix(s3-bridge): restore backward config compatibility --- .../src/emqx_bridge_s3_upload.erl | 33 +++++++++---- scripts/conf-test/old-confs/e5.6.1.conf | 49 +++++++++++++++++++ 2 files changed, 72 insertions(+), 10 deletions(-) create mode 100644 scripts/conf-test/old-confs/e5.6.1.conf diff --git a/apps/emqx_bridge_s3/src/emqx_bridge_s3_upload.erl b/apps/emqx_bridge_s3/src/emqx_bridge_s3_upload.erl index 00fd6fdc0..8b7c3216e 100644 --- a/apps/emqx_bridge_s3/src/emqx_bridge_s3_upload.erl +++ b/apps/emqx_bridge_s3/src/emqx_bridge_s3_upload.erl @@ -63,10 +63,14 @@ fields(action) -> fields(?ACTION) -> emqx_bridge_v2_schema:make_producer_action_schema( hoconsc:mk( - mkunion(mode, #{ - <<"direct">> => ?R_REF(s3_direct_upload_parameters), - <<"aggregated">> => ?R_REF(s3_aggregated_upload_parameters) - }), + mkunion( + mode, + #{ + <<"direct">> => ?R_REF(s3_direct_upload_parameters), + <<"aggregated">> => ?R_REF(s3_aggregated_upload_parameters) + }, + <<"direct">> + ), #{ required => true, desc => ?DESC(s3_upload), @@ -87,7 +91,7 @@ fields(s3_direct_upload_parameters) -> hoconsc:mk( direct, #{ - required => true, + default => <<"direct">>, desc => ?DESC(s3_direct_upload_mode) } )}, @@ -187,13 +191,22 @@ fields(s3_upload_resource_opts) -> ]). mkunion(Field, Schemas) -> - hoconsc:union(fun(Arg) -> scunion(Field, Schemas, Arg) end). + mkunion(Field, Schemas, none). -scunion(_Field, Schemas, all_union_members) -> +mkunion(Field, Schemas, Default) -> + hoconsc:union(fun(Arg) -> scunion(Field, Schemas, Default, Arg) end). + +scunion(_Field, Schemas, _Default, all_union_members) -> maps:values(Schemas); -scunion(Field, Schemas, {value, Value}) -> - Selector = maps:get(emqx_utils_conv:bin(Field), Value, undefined), - case Selector == undefined orelse maps:find(emqx_utils_conv:bin(Selector), Schemas) of +scunion(Field, Schemas, Default, {value, Value}) -> + Selector = + case maps:get(emqx_utils_conv:bin(Field), Value, undefined) of + undefined -> + Default; + X -> + emqx_utils_conv:bin(X) + end, + case maps:find(Selector, Schemas) of {ok, Schema} -> [Schema]; _Error -> diff --git a/scripts/conf-test/old-confs/e5.6.1.conf b/scripts/conf-test/old-confs/e5.6.1.conf new file mode 100644 index 000000000..999230a32 --- /dev/null +++ b/scripts/conf-test/old-confs/e5.6.1.conf @@ -0,0 +1,49 @@ +node { + name = "emqx@127.0.0.1" + cookie = "emqxsecretcookie" + data_dir = "data" +} + +actions { + s3 { + s3direct { + connector = s3local + enable = true + parameters { + acl = private + bucket = direct + content = "${.}" + headers {} + key = "${clientid}/${id}" + } + resource_opts { + health_check_interval = 15s + inflight_window = 100 + max_buffer_bytes = 256MB + query_mode = async + request_ttl = 45s + worker_pool_size = 16 + } + } + } +} +connectors { + s3 { + s3local { + access_key_id = ACCESS + host = localhost + port = 9000 + resource_opts {health_check_interval = 15s, start_timeout = 5s} + secret_access_key = SECRET + transport_options { + connect_timeout = 15s + enable_pipelining = 100 + headers {} + ipv6_probe = false + pool_size = 8 + pool_type = random + ssl {enable = false, verify = verify_peer} + } + } + } +} From 4580906405757caae438cc5260c2e0b18f651cb4 Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Tue, 21 May 2024 17:40:18 +0200 Subject: [PATCH 25/55] fix(ds): Use erpc instead of gen_rpc for `delete_next' --- apps/emqx_durable_storage/src/proto/emqx_ds_proto_v4.erl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/apps/emqx_durable_storage/src/proto/emqx_ds_proto_v4.erl b/apps/emqx_durable_storage/src/proto/emqx_ds_proto_v4.erl index 62ea33c3e..a53791feb 100644 --- a/apps/emqx_durable_storage/src/proto/emqx_ds_proto_v4.erl +++ b/apps/emqx_durable_storage/src/proto/emqx_ds_proto_v4.erl @@ -179,8 +179,7 @@ make_delete_iterator(Node, DB, Shard, Stream, TopicFilter, StartTime) -> | {ok, end_of_stream} | {error, _}. delete_next(Node, DB, Shard, Iter, Selector, BatchSize) -> - emqx_rpc:call( - Shard, + erpc:call( Node, emqx_ds_replication_layer, do_delete_next_v4, From c6fc76e3355ab3caa5ec694440e396740a96dffe Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Sat, 18 May 2024 15:53:21 +0200 Subject: [PATCH 26/55] fix(ds): Perform read operations on the leader. --- apps/emqx_durable_storage/README.md | 2 + .../src/emqx_ds_replication_layer.erl | 98 ++++++++++++++----- .../src/emqx_ds_replication_layer_shard.erl | 48 ++++----- .../test/emqx_ds_replication_SUITE.erl | 15 ++- 4 files changed, 104 insertions(+), 59 deletions(-) diff --git a/apps/emqx_durable_storage/README.md b/apps/emqx_durable_storage/README.md index f67cc3e24..362ad47a3 100644 --- a/apps/emqx_durable_storage/README.md +++ b/apps/emqx_durable_storage/README.md @@ -124,6 +124,8 @@ The following application environment variables are available: - `emqx_durable_storage.egress_flush_interval`: period at which the batches of messages are committed to the durable storage. +- `emqx_durable_storage.reads`: `leader_preferred` | `local_preferred`. + Runtime settings for the durable storages can be modified via CLI as well as the REST API. The following CLI commands are available: diff --git a/apps/emqx_durable_storage/src/emqx_ds_replication_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_replication_layer.erl index 9330e0b1a..f3b5fdedf 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_replication_layer.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_replication_layer.erl @@ -561,12 +561,27 @@ list_nodes() -> %% Too large for normal operation, need better backpressure mechanism. -define(RA_TIMEOUT, 60 * 1000). --define(SAFERPC(EXPR), +-define(SAFE_ERPC(EXPR), try EXPR catch - error:RPCError = {erpc, _} -> - {error, recoverable, RPCError} + error:RPCError__ = {erpc, _} -> + {error, recoverable, RPCError__} + end +). + +-define(SHARD_RPC(DB, SHARD, NODE, BODY), + case + emqx_ds_replication_layer_shard:servers( + DB, SHARD, application:get_env(emqx_durable_storage, reads, leader_preferred) + ) + of + [{_, NODE} | _] -> + begin + BODY + end; + [] -> + {error, recoverable, replica_offline} end ). @@ -623,44 +638,79 @@ ra_drop_generation(DB, Shard, GenId) -> end. ra_get_streams(DB, Shard, TopicFilter, Time) -> - {_, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred), TimestampUs = timestamp_to_timeus(Time), - ?SAFERPC(emqx_ds_proto_v4:get_streams(Node, DB, Shard, TopicFilter, TimestampUs)). + ?SHARD_RPC( + DB, + Shard, + Node, + ?SAFE_ERPC(emqx_ds_proto_v4:get_streams(Node, DB, Shard, TopicFilter, TimestampUs)) + ). ra_get_delete_streams(DB, Shard, TopicFilter, Time) -> - {_Name, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred), - ?SAFERPC(emqx_ds_proto_v4:get_delete_streams(Node, DB, Shard, TopicFilter, Time)). + ?SHARD_RPC( + DB, + Shard, + Node, + ?SAFE_ERPC(emqx_ds_proto_v4:get_delete_streams(Node, DB, Shard, TopicFilter, Time)) + ). ra_make_iterator(DB, Shard, Stream, TopicFilter, StartTime) -> - {_, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred), TimeUs = timestamp_to_timeus(StartTime), - ?SAFERPC(emqx_ds_proto_v4:make_iterator(Node, DB, Shard, Stream, TopicFilter, TimeUs)). + ?SHARD_RPC( + DB, + Shard, + Node, + ?SAFE_ERPC(emqx_ds_proto_v4:make_iterator(Node, DB, Shard, Stream, TopicFilter, TimeUs)) + ). ra_make_delete_iterator(DB, Shard, Stream, TopicFilter, StartTime) -> - {_Name, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred), TimeUs = timestamp_to_timeus(StartTime), - ?SAFERPC(emqx_ds_proto_v4:make_delete_iterator(Node, DB, Shard, Stream, TopicFilter, TimeUs)). + ?SHARD_RPC( + DB, + Shard, + Node, + ?SAFE_ERPC( + emqx_ds_proto_v4:make_delete_iterator(Node, DB, Shard, Stream, TopicFilter, TimeUs) + ) + ). ra_update_iterator(DB, Shard, Iter, DSKey) -> - {_Name, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred), - ?SAFERPC(emqx_ds_proto_v4:update_iterator(Node, DB, Shard, Iter, DSKey)). + ?SHARD_RPC( + DB, + Shard, + Node, + ?SAFE_ERPC(emqx_ds_proto_v4:update_iterator(Node, DB, Shard, Iter, DSKey)) + ). ra_next(DB, Shard, Iter, BatchSize) -> - {_Name, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred), - case emqx_ds_proto_v4:next(Node, DB, Shard, Iter, BatchSize) of - RPCError = {badrpc, _} -> - {error, recoverable, RPCError}; - Other -> - Other - end. + ?SHARD_RPC( + DB, + Shard, + Node, + case emqx_ds_proto_v4:next(Node, DB, Shard, Iter, BatchSize) of + Err = {badrpc, _} -> + {error, recoverable, Err}; + Ret -> + Ret + end + ). ra_delete_next(DB, Shard, Iter, Selector, BatchSize) -> - {_Name, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred), - emqx_ds_proto_v4:delete_next(Node, DB, Shard, Iter, Selector, BatchSize). + ?SHARD_RPC( + DB, + Shard, + Node, + ?SAFE_ERPC(emqx_ds_proto_v4:delete_next(Node, DB, Shard, Iter, Selector, BatchSize)) + ). ra_list_generations_with_lifetimes(DB, Shard) -> - {_Name, Node} = emqx_ds_replication_layer_shard:server(DB, Shard, local_preferred), - case ?SAFERPC(emqx_ds_proto_v4:list_generations_with_lifetimes(Node, DB, Shard)) of + Reply = ?SHARD_RPC( + DB, + Shard, + Node, + ?SAFE_ERPC(emqx_ds_proto_v4:list_generations_with_lifetimes(Node, DB, Shard)) + ), + case Reply of Gens = #{} -> maps:map( fun(_GenId, Data = #{since := Since, until := Until}) -> diff --git a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl index 0bfa89e95..518e1e630 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl @@ -28,8 +28,7 @@ %% Dynamic server location API -export([ - servers/3, - server/3 + servers/3 ]). %% Membership @@ -83,16 +82,15 @@ server_name(DB, Shard, Site) -> %% --spec servers(emqx_ds:db(), emqx_ds_replication_layer:shard_id(), Order) -> [server(), ...] when - Order :: leader_preferred | undefined. -servers(DB, Shard, _Order = leader_preferred) -> +-spec servers(emqx_ds:db(), emqx_ds_replication_layer:shard_id(), Order) -> [server()] when + Order :: leader_preferred | local_preferred | undefined. +servers(DB, Shard, leader_preferred) -> get_servers_leader_preferred(DB, Shard); +servers(DB, Shard, local_preferred) -> + get_servers_local_preferred(DB, Shard); servers(DB, Shard, _Order = undefined) -> get_shard_servers(DB, Shard). -server(DB, Shard, _Which = local_preferred) -> - get_server_local_preferred(DB, Shard). - get_servers_leader_preferred(DB, Shard) -> %% NOTE: Contact last known leader first, then rest of shard servers. ClusterName = get_cluster_name(DB, Shard), @@ -104,17 +102,24 @@ get_servers_leader_preferred(DB, Shard) -> get_online_servers(DB, Shard) end. -get_server_local_preferred(DB, Shard) -> - %% NOTE: Contact either local server or a random replica. +get_servers_local_preferred(DB, Shard) -> + %% Return list of servers, where the local replica (if exists) is + %% the first element. Note: result is _NOT_ shuffled. This can be + %% bad for the load balancing, but it makes results more + %% deterministic. Caller that doesn't care about that can shuffle + %% the results by itself. ClusterName = get_cluster_name(DB, Shard), case ra_leaderboard:lookup_members(ClusterName) of - Servers when is_list(Servers) -> - pick_local(Servers); undefined -> - %% TODO - %% Leader is unkonwn if there are no servers of this group on the - %% local node. We want to pick a replica in that case as well. - pick_random(get_online_servers(DB, Shard)) + Servers = get_online_servers(DB, Shard); + Servers when is_list(Servers) -> + ok + end, + case lists:keyfind(node(), 2, Servers) of + false -> + Servers; + Local when is_tuple(Local) -> + [Local | lists:delete(Local, Servers)] end. lookup_leader(DB, Shard) -> @@ -139,17 +144,6 @@ filter_online(Servers) -> is_server_online({_Name, Node}) -> Node == node() orelse lists:member(Node, nodes()). -pick_local(Servers) -> - case lists:keyfind(node(), 2, Servers) of - Local when is_tuple(Local) -> - Local; - false -> - pick_random(Servers) - end. - -pick_random(Servers) -> - lists:nth(rand:uniform(length(Servers)), Servers). - get_cluster_name(DB, Shard) -> memoize(fun cluster_name/2, [DB, Shard]). diff --git a/apps/emqx_durable_storage/test/emqx_ds_replication_SUITE.erl b/apps/emqx_durable_storage/test/emqx_ds_replication_SUITE.erl index 8303ff861..eacf7e301 100644 --- a/apps/emqx_durable_storage/test/emqx_ds_replication_SUITE.erl +++ b/apps/emqx_durable_storage/test/emqx_ds_replication_SUITE.erl @@ -479,11 +479,13 @@ t_rebalance_offline_restarts(Config) -> %% shard_server_info(Node, DB, Shard, Site, Info) -> - Server = shard_server(Node, DB, Shard, Site), - {Server, ds_repl_shard(Node, server_info, [Info, Server])}. - -shard_server(Node, DB, Shard, Site) -> - ds_repl_shard(Node, shard_server, [DB, Shard, Site]). + ?ON( + Node, + begin + Server = emqx_ds_replication_layer_shard:shard_server(DB, Shard, Site), + {Server, emqx_ds_replication_layer_shard:server_info(Info, Server)} + end + ). ds_repl_meta(Node, Fun) -> ds_repl_meta(Node, Fun, []). @@ -499,9 +501,6 @@ ds_repl_meta(Node, Fun, Args) -> error(meta_op_failed) end. -ds_repl_shard(Node, Fun, Args) -> - erpc:call(Node, emqx_ds_replication_layer_shard, Fun, Args). - shards(Node, DB) -> erpc:call(Node, emqx_ds_replication_layer_meta, shards, [DB]). From aca2d9586c0d91d81ff1714f0b4d9fe71c12f759 Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Sat, 18 May 2024 15:53:53 +0200 Subject: [PATCH 27/55] fix(ds): Fix return type of drop_generation --- apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl index fdba6335f..438955367 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl @@ -513,7 +513,7 @@ add_generation(ShardId, Since) -> list_generations_with_lifetimes(ShardId) -> gen_server:call(?REF(ShardId), #call_list_generations_with_lifetimes{}, infinity). --spec drop_generation(shard_id(), gen_id()) -> ok. +-spec drop_generation(shard_id(), gen_id()) -> ok | {error, _}. drop_generation(ShardId, GenId) -> gen_server:call(?REF(ShardId), #call_drop_generation{gen_id = GenId}, infinity). From 1526c527d0287ae671d5e1593196a10241988546 Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Sat, 18 May 2024 15:54:25 +0200 Subject: [PATCH 28/55] fix(ds): Log generation operations --- .../src/emqx_ds_replication_layer.erl | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/apps/emqx_durable_storage/src/emqx_ds_replication_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_replication_layer.erl index f3b5fdedf..ee4e71812 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_replication_layer.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_replication_layer.erl @@ -761,6 +761,14 @@ apply( #{?tag := add_generation, ?since := Since}, #{db_shard := DBShard, latest := Latest0} = State0 ) -> + ?tp( + info, + ds_replication_layer_add_generation, + #{ + shard => DBShard, + since => Since + } + ), {Timestamp, Latest} = ensure_monotonic_timestamp(Since, Latest0), Result = emqx_ds_storage_layer:add_generation(DBShard, Timestamp), State = State0#{latest := Latest}, @@ -771,6 +779,15 @@ apply( #{?tag := update_config, ?since := Since, ?config := Opts}, #{db_shard := DBShard, latest := Latest0} = State0 ) -> + ?tp( + notice, + ds_replication_layer_update_config, + #{ + shard => DBShard, + config => Opts, + since => Since + } + ), {Timestamp, Latest} = ensure_monotonic_timestamp(Since, Latest0), Result = emqx_ds_storage_layer:update_config(DBShard, Timestamp, Opts), State = State0#{latest := Latest}, @@ -780,6 +797,14 @@ apply( #{?tag := drop_generation, ?generation := GenId}, #{db_shard := DBShard} = State ) -> + ?tp( + info, + ds_replication_layer_drop_generation, + #{ + shard => DBShard, + generation => GenId + } + ), Result = emqx_ds_storage_layer:drop_generation(DBShard, GenId), {State, Result}; apply( From e4a73f003a8f7ee5272de3f5c8fb6a9a6c4eca44 Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Sun, 19 May 2024 09:20:16 +0200 Subject: [PATCH 29/55] feat(ds): Implement format_status callback Reduce volume of logs and crash reports from DS --- .../src/emqx_ds_replication_layer_egress.erl | 9 ++++++++- .../src/emqx_ds_storage_layer.erl | 20 ++++++++++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_egress.erl b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_egress.erl index 9201ccf04..dc76aecf0 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_egress.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_egress.erl @@ -33,7 +33,7 @@ -export([start_link/2, store_batch/3]). %% behavior callbacks: --export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2]). +-export([init/1, format_status/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2]). %% internal exports: -export([]). @@ -129,6 +129,13 @@ init([DB, Shard]) -> }, {ok, S}. +format_status(#s{db = DB, shard = Shard, queue = Q}) -> + #{ + db => DB, + shard => Shard, + queue => queue:len(Q) + }. + handle_call( #enqueue_req{ messages = Msgs, diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl index 438955367..57930fa72 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl @@ -52,7 +52,7 @@ ]). %% gen_server --export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2]). +-export([init/1, format_status/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2]). %% internal exports: -export([db_dir/1]). @@ -586,6 +586,24 @@ init({ShardId, Options}) -> commit_metadata(S), {ok, S}. +format_status(#s{shard_id = ShardId, db = DB, cf_refs = CFRefs, schema = Schema, shard = Shard}) -> + #{ + id => ShardId, + db => DB, + cf_refs => CFRefs, + schema => Schema, + shard => + maps:map( + fun + (?GEN_KEY(_), _Schema) -> + '...'; + (_K, Val) -> + Val + end, + Shard + ) + }. + handle_call(#call_update_config{since = Since, options = Options}, _From, S0) -> case handle_update_config(S0, Since, Options) of S = #s{} -> From 074d98a14a53c51ff94170847560ee00fc38bed6 Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Sun, 19 May 2024 09:24:02 +0200 Subject: [PATCH 30/55] test(ds): Refactor ds_SUITE --- apps/emqx_durable_storage/test/emqx_ds_SUITE.erl | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl b/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl index 49742fdc6..eb14456cb 100644 --- a/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl +++ b/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl @@ -67,10 +67,16 @@ t_00_smoke_open_drop(_Config) -> %% A simple smoke test that verifies that storing the messages doesn't %% crash t_01_smoke_store(_Config) -> - DB = default, - ?assertMatch(ok, emqx_ds:open_db(DB, opts())), - Msg = message(<<"foo/bar">>, <<"foo">>, 0), - ?assertMatch(ok, emqx_ds:store_batch(DB, [Msg])). + ?check_trace( + #{timetrap => 10_000}, + begin + DB = default, + ?assertMatch(ok, emqx_ds:open_db(DB, opts())), + Msg = message(<<"foo/bar">>, <<"foo">>, 0), + ?assertMatch(ok, emqx_ds:store_batch(DB, [Msg])) + end, + [] + ). %% A simple smoke test that verifies that getting the list of streams %% doesn't crash and that iterators can be opened. From eb7c43ee9df3014ddda865e20f06b74230615508 Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Sun, 19 May 2024 13:20:12 +0200 Subject: [PATCH 31/55] fix(ds): Always store messages in the current generation --- .../src/emqx_ds_storage_layer.erl | 5 +++-- .../test/emqx_ds_storage_SUITE.erl | 21 +------------------ 2 files changed, 4 insertions(+), 22 deletions(-) diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl index 57930fa72..d36d8e96f 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl @@ -297,13 +297,14 @@ store_batch(Shard, Messages, Options) -> [{emqx_ds:time(), emqx_types:message()}], emqx_ds:message_store_opts() ) -> {ok, cooked_batch()} | ignore | emqx_ds:error(_). -prepare_batch(Shard, Messages = [{Time, _Msg} | _], Options) -> +prepare_batch(Shard, Messages = [_ | _], Options) -> %% NOTE %% We assume that batches do not span generations. Callers should enforce this. ?tp(emqx_ds_storage_layer_prepare_batch, #{ shard => Shard, messages => Messages, options => Options }), - {GenId, #{module := Mod, data := GenData}} = generation_at(Shard, Time), + GenId = generation_current(Shard), + #{module := Mod, data := GenData} = generation_get(Shard, GenId), T0 = erlang:monotonic_time(microsecond), Result = case Mod:prepare_batch(Shard, GenData, Messages, Options) of diff --git a/apps/emqx_durable_storage/test/emqx_ds_storage_SUITE.erl b/apps/emqx_durable_storage/test/emqx_ds_storage_SUITE.erl index 39158c7ef..dad18f89e 100644 --- a/apps/emqx_durable_storage/test/emqx_ds_storage_SUITE.erl +++ b/apps/emqx_durable_storage/test/emqx_ds_storage_SUITE.erl @@ -27,25 +27,6 @@ opts() -> %% -t_idempotent_store_batch(_Config) -> - Shard = {?FUNCTION_NAME, _ShardId = <<"42">>}, - {ok, Pid} = emqx_ds_storage_layer:start_link(Shard, opts()), - %% Push some messages to the shard. - Msgs1 = [gen_message(N) || N <- lists:seq(10, 20)], - GenTs = 30, - Msgs2 = [gen_message(N) || N <- lists:seq(40, 50)], - ?assertEqual(ok, emqx_ds_storage_layer:store_batch(Shard, batch(Msgs1), #{})), - %% Add new generation and push the same batch + some more. - ?assertEqual(ok, emqx_ds_storage_layer:add_generation(Shard, GenTs)), - ?assertEqual(ok, emqx_ds_storage_layer:store_batch(Shard, batch(Msgs1), #{})), - ?assertEqual(ok, emqx_ds_storage_layer:store_batch(Shard, batch(Msgs2), #{})), - %% First batch should have been handled idempotently. - ?assertEqual( - Msgs1 ++ Msgs2, - lists:keysort(#message.timestamp, emqx_ds_test_helpers:storage_consume(Shard, ['#'])) - ), - ok = stop_shard(Pid). - t_snapshot_take_restore(_Config) -> Shard = {?FUNCTION_NAME, _ShardId = <<"42">>}, {ok, Pid} = emqx_ds_storage_layer:start_link(Shard, opts()), @@ -77,7 +58,7 @@ t_snapshot_take_restore(_Config) -> %% Verify that the restored shard contains the messages up until the snapshot. {ok, _Pid} = emqx_ds_storage_layer:start_link(Shard, opts()), - ?assertEqual( + snabbkaffe_diff:assert_lists_eq( Msgs1 ++ Msgs2, lists:keysort(#message.timestamp, emqx_ds_test_helpers:storage_consume(Shard, ['#'])) ). From acdae4fba38f235cf860b3801f5532bd82678498 Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Sun, 19 May 2024 13:21:10 +0200 Subject: [PATCH 32/55] fix(ds): Workaround for the idempotency error when dropping gens --- .../src/emqx_ds_storage_layer.erl | 70 ++++++++++++------- 1 file changed, 46 insertions(+), 24 deletions(-) diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl index d36d8e96f..68e2f4597 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl @@ -564,6 +564,7 @@ start_link(Shard = {_, _}, Options) -> init({ShardId, Options}) -> process_flag(trap_exit, true), + ?tp(info, ds_storage_init, #{shard => ShardId}), logger:set_process_metadata(#{shard_id => ShardId, domain => [ds, storage_layer, shard]}), erase_schema_runtime(ShardId), clear_all_checkpoints(ShardId), @@ -777,18 +778,31 @@ handle_drop_generation(S0, GenId) -> } = S0, #{module := Mod, cf_refs := GenCFRefs} = GenSchema, #{?GEN_KEY(GenId) := #{data := RuntimeData}} = OldShard, - case Mod:drop(ShardId, DB, GenId, GenCFRefs, RuntimeData) of - ok -> - CFRefs = OldCFRefs -- GenCFRefs, - Shard = maps:remove(?GEN_KEY(GenId), OldShard), - Schema = maps:remove(?GEN_KEY(GenId), OldSchema), - S = S0#s{ - cf_refs = CFRefs, - shard = Shard, - schema = Schema - }, - {ok, S} - end. + try + Mod:drop(ShardId, DB, GenId, GenCFRefs, RuntimeData) + catch + EC:Err:Stack -> + ?tp( + error, + ds_storage_layer_failed_to_drop_generation, + #{ + shard => ShardId, + EC => Err, + stacktrace => Stack, + generation => GenId, + s => format_status(S0) + } + ) + end, + CFRefs = OldCFRefs -- GenCFRefs, + Shard = maps:remove(?GEN_KEY(GenId), OldShard), + Schema = maps:remove(?GEN_KEY(GenId), OldSchema), + S = S0#s{ + cf_refs = CFRefs, + shard = Shard, + schema = Schema + }, + {ok, S}. -spec open_generation(shard_id(), rocksdb:db_handle(), cf_refs(), gen_id(), generation_schema()) -> generation(). @@ -940,14 +954,18 @@ handle_accept_snapshot(ShardId) -> %% general. %% %% The mechanism of storage layer events should be refined later. --spec handle_event(shard_id(), emqx_ds:time(), CustomEvent | tick) -> [CustomEvent]. +-spec handle_event(shard_id(), emqx_ds:time(), CustomEvent | tick) -> {gen_id(), [CustomEvent]}. handle_event(Shard, Time, Event) -> - {_GenId, #{module := Mod, data := GenData}} = generation_at(Shard, Time), - ?tp(emqx_ds_storage_layer_event, #{mod => Mod, time => Time, event => Event}), - case erlang:function_exported(Mod, handle_event, 4) of - true -> - Mod:handle_event(Shard, GenData, Time, Event); - false -> + case generation_at(Shard, Time) of + {_GenId, #{module := Mod, data := GenData}} -> + ?tp(emqx_ds_storage_layer_event, #{mod => Mod, time => Time, event => Event}), + case erlang:function_exported(Mod, handle_event, 4) of + true -> + Mod:handle_event(Shard, GenData, Time, Event); + false -> + [] + end; + _ -> [] end. @@ -989,12 +1007,16 @@ generation_at(Shard, Time) -> generation_at(Time, Current, Schema). generation_at(Time, GenId, Schema) -> - #{?GEN_KEY(GenId) := Gen} = Schema, - case Gen of - #{since := Since} when Time < Since andalso GenId > 0 -> - generation_at(Time, prev_generation_id(GenId), Schema); + case Schema of + #{?GEN_KEY(GenId) := Gen} -> + case Gen of + #{since := Since} when Time < Since andalso GenId > 0 -> + generation_at(Time, prev_generation_id(GenId), Schema); + _ -> + {GenId, Gen} + end; _ -> - {GenId, Gen} + not_found end. -define(PERSISTENT_TERM(SHARD), {emqx_ds_storage_layer, SHARD}). From 0ff307e7895a1eef2ae12dd02ed087e59994f37e Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Sun, 19 May 2024 21:19:57 +0200 Subject: [PATCH 33/55] fix(ds): Include generation ID in the storage events Make sure storage events originating from generation X are handled in the context of the same generation. --- .../src/emqx_ds_storage_layer.erl | 56 ++++++------------- 1 file changed, 17 insertions(+), 39 deletions(-) diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl index 68e2f4597..e93780ba2 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl @@ -80,6 +80,10 @@ -define(stream_v2(GENERATION, INNER), [GENERATION | INNER]). -define(delete_stream(GENERATION, INNER), [GENERATION | INNER]). +%% Wrappers for the storage events: +-define(storage_event(GEN_ID, PAYLOAD), #{0 := 3333, 1 := GEN_ID, 2 := PAYLOAD}). +-define(mk_storage_event(GEN_ID, PAYLOAD), #{0 => 3333, 1 => GEN_ID, 2 => PAYLOAD}). + %%================================================================================ %% Type declarations %%================================================================================ @@ -848,10 +852,6 @@ new_generation(ShardId, DB, Schema0, Since) -> next_generation_id(GenId) -> GenId + 1. --spec prev_generation_id(gen_id()) -> gen_id(). -prev_generation_id(GenId) when GenId > 0 -> - GenId - 1. - %% @doc Commit current state of the server to both rocksdb and the persistent term -spec commit_metadata(server_state()) -> ok. commit_metadata(#s{shard_id = ShardId, schema = Schema, shard = Runtime, db = DB}) -> @@ -947,27 +947,23 @@ handle_accept_snapshot(ShardId) -> Dir = db_dir(ShardId), emqx_ds_storage_snapshot:new_writer(Dir). -%% FIXME: currently this interface is a hack to handle safe cutoff -%% timestamp in LTS. It has many shortcomings (can lead to infinite -%% loops if the CBM is not careful; events from one generation may be -%% sent to the next one, etc.) and the API is not well thought out in -%% general. -%% -%% The mechanism of storage layer events should be refined later. --spec handle_event(shard_id(), emqx_ds:time(), CustomEvent | tick) -> {gen_id(), [CustomEvent]}. -handle_event(Shard, Time, Event) -> - case generation_at(Shard, Time) of - {_GenId, #{module := Mod, data := GenData}} -> - ?tp(emqx_ds_storage_layer_event, #{mod => Mod, time => Time, event => Event}), +-spec handle_event(shard_id(), emqx_ds:time(), Event) -> [Event]. +handle_event(Shard, Time, ?storage_event(GenId, Event)) -> + case generation_get(Shard, GenId) of + not_found -> + []; + #{module := Mod, data := GenData} -> case erlang:function_exported(Mod, handle_event, 4) of true -> - Mod:handle_event(Shard, GenData, Time, Event); + NewEvents = Mod:handle_event(Shard, GenData, Time, Event), + [?mk_storage_event(GenId, E) || E <- NewEvents]; false -> [] - end; - _ -> - [] - end. + end + end; +handle_event(Shard, Time, Event) -> + GenId = generation_current(Shard), + handle_event(Shard, Time, ?mk_storage_event(GenId, Event)). %%-------------------------------------------------------------------------------- %% Schema access @@ -1001,24 +997,6 @@ generations_since(Shard, Since) -> Schema ). --spec generation_at(shard_id(), emqx_ds:time()) -> {gen_id(), generation()}. -generation_at(Shard, Time) -> - Schema = #{current_generation := Current} = get_schema_runtime(Shard), - generation_at(Time, Current, Schema). - -generation_at(Time, GenId, Schema) -> - case Schema of - #{?GEN_KEY(GenId) := Gen} -> - case Gen of - #{since := Since} when Time < Since andalso GenId > 0 -> - generation_at(Time, prev_generation_id(GenId), Schema); - _ -> - {GenId, Gen} - end; - _ -> - not_found - end. - -define(PERSISTENT_TERM(SHARD), {emqx_ds_storage_layer, SHARD}). -spec get_schema_runtime(shard_id()) -> shard(). From 60edf5e9b8cf16f02657840a3289b32c960b4fb7 Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Sun, 19 May 2024 23:12:46 +0200 Subject: [PATCH 34/55] fix(ds): Move responsibility of returning end_of_stream to the CBM --- .../src/emqx_ds_storage_bitfield_lts.erl | 39 +++++++++++++------ .../src/emqx_ds_storage_layer.erl | 15 +++---- .../src/emqx_ds_storage_reference.erl | 11 ++++-- 3 files changed, 41 insertions(+), 24 deletions(-) diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl index ebbcde17c..8acb6e529 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl @@ -35,7 +35,7 @@ make_iterator/5, make_delete_iterator/5, update_iterator/4, - next/5, + next/6, delete_next/6, post_creation_actions/1, @@ -424,23 +424,21 @@ next( Schema = #s{ts_offset = TSOffset, ts_bits = TSBits}, It = #{?storage_key := Stream}, BatchSize, - Now + Now, + IsCurrent ) -> init_counters(), %% Compute safe cutoff time. It's the point in time where the last %% complete epoch ends, so we need to know the current time to %% compute it. This is needed because new keys can be added before %% the iterator. - IsWildcard = + %% + %% This is needed to avoid situations when the iterator advances + %% to position k1, and then a new message with k2, such that k2 < + %% k1 is inserted. k2 would be missed. + HasCutoff = case Stream of - {_StaticKey, []} -> false; - _ -> true - end, - SafeCutoffTime = - case IsWildcard of - true -> - (Now bsr TSOffset) bsl TSOffset; - false -> + {_StaticKey, []} -> %% Iterators scanning streams without varying topic %% levels can operate on incomplete epochs, since new %% matching keys for the single topic are added in @@ -450,10 +448,27 @@ next( %% filters operating on streams with varying parts: %% iterator can jump to the next topic and then it %% won't backtrack. + false; + _ -> + %% New batches are only added to the current + %% generation. We can ignore cutoff time for old + %% generations: + IsCurrent + end, + SafeCutoffTime = + case HasCutoff of + true -> + (Now bsr TSOffset) bsl TSOffset; + false -> 1 bsl TSBits - 1 end, try - next_until(Schema, It, SafeCutoffTime, BatchSize) + case next_until(Schema, It, SafeCutoffTime, BatchSize) of + {ok, _, []} when not IsCurrent -> + {ok, end_of_stream}; + Result -> + Result + end after report_counters(Shard) end. diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl index e93780ba2..71bf6fa6e 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl @@ -248,8 +248,8 @@ ) -> emqx_ds:make_delete_iterator_result(_Iterator). --callback next(shard_id(), _Data, Iter, pos_integer(), emqx_ds:time()) -> - {ok, Iter, [emqx_types:message()]} | {error, _}. +-callback next(shard_id(), _Data, Iter, pos_integer(), emqx_ds:time(), _IsCurrent :: boolean()) -> + {ok, Iter, [emqx_types:message()]} | {ok, end_of_stream} | {error, _}. -callback delete_next( shard_id(), _Data, DeleteIterator, emqx_ds:delete_selector(), pos_integer(), emqx_ds:time() @@ -449,15 +449,12 @@ update_iterator( next(Shard, Iter = #{?tag := ?IT, ?generation := GenId, ?enc := GenIter0}, BatchSize, Now) -> case generation_get(Shard, GenId) of #{module := Mod, data := GenData} -> - Current = generation_current(Shard), - case Mod:next(Shard, GenData, GenIter0, BatchSize, Now) of - {ok, _GenIter, []} when GenId < Current -> - %% This is a past generation. Storage layer won't write - %% any more messages here. The iterator reached the end: - %% the stream has been fully replayed. - {ok, end_of_stream}; + IsCurrent = GenId =:= generation_current(Shard), + case Mod:next(Shard, GenData, GenIter0, BatchSize, Now, IsCurrent) of {ok, GenIter, Batch} -> {ok, Iter#{?enc := GenIter}, Batch}; + {ok, end_of_stream} -> + {ok, end_of_stream}; Error = {error, _, _} -> Error end; diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_reference.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_reference.erl index 10007488c..1c506390e 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_reference.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_reference.erl @@ -38,7 +38,7 @@ make_iterator/5, make_delete_iterator/5, update_iterator/4, - next/5, + next/6, delete_next/6 ]). @@ -148,7 +148,7 @@ update_iterator(_Shard, _Data, OldIter, DSKey) -> last_seen_message_key = DSKey }}. -next(_Shard, #s{db = DB, cf = CF}, It0, BatchSize, _Now) -> +next(_Shard, #s{db = DB, cf = CF}, It0, BatchSize, _Now, IsCurrent) -> #it{topic_filter = TopicFilter, start_time = StartTime, last_seen_message_key = Key0} = It0, {ok, ITHandle} = rocksdb:iterator(DB, CF, []), Action = @@ -162,7 +162,12 @@ next(_Shard, #s{db = DB, cf = CF}, It0, BatchSize, _Now) -> {Key, Messages} = do_next(TopicFilter, StartTime, ITHandle, Action, BatchSize, Key0, []), rocksdb:iterator_close(ITHandle), It = It0#it{last_seen_message_key = Key}, - {ok, It, lists:reverse(Messages)}. + case Messages of + [] when not IsCurrent -> + {ok, end_of_stream}; + _ -> + {ok, It, lists:reverse(Messages)} + end. delete_next(_Shard, #s{db = DB, cf = CF}, It0, Selector, BatchSize, _Now) -> #delete_it{ From b3ded7edce091f7d8e7905feb14a3f226ed24180 Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Mon, 20 May 2024 12:56:53 +0200 Subject: [PATCH 35/55] fix(ds): Fix code review remark --- .../src/emqx_ds_builtin_db_sup.erl | 28 +++++++++++ .../src/emqx_ds_replication_layer_egress.erl | 20 +++++--- .../src/emqx_ds_storage_layer.erl | 47 ++++++++++++------- 3 files changed, 71 insertions(+), 24 deletions(-) diff --git a/apps/emqx_durable_storage/src/emqx_ds_builtin_db_sup.erl b/apps/emqx_durable_storage/src/emqx_ds_builtin_db_sup.erl index 06e925c1b..b2a461e7a 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_builtin_db_sup.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_builtin_db_sup.erl @@ -33,6 +33,12 @@ ]). -export([which_dbs/0, which_shards/1]). +%% Debug: +-export([ + get_egress_workers/1, + get_shard_workers/1 +]). + %% behaviour callbacks: -export([init/1]). @@ -111,6 +117,28 @@ which_dbs() -> Key = {n, l, #?db_sup{_ = '_', db = '$1'}}, gproc:select({local, names}, [{{Key, '_', '_'}, [], ['$1']}]). +%% @doc Get pids of all local egress servers for the given DB. +-spec get_egress_workers(emqx_ds:db()) -> #{_Shard => pid()}. +get_egress_workers(DB) -> + Children = supervisor:which_children(?via(#?egress_sup{db = DB})), + L = [{Shard, Child} || {Shard, Child, _, _} <- Children, is_pid(Child)], + maps:from_list(L). + +%% @doc Get pids of all local shard servers for the given DB. +-spec get_shard_workers(emqx_ds:db()) -> #{_Shard => pid()}. +get_shard_workers(DB) -> + Shards = supervisor:which_children(?via(#?shards_sup{db = DB})), + L = lists:flatmap( + fun + ({_Shard, Sup, _, _}) when is_pid(Sup) -> + [{Id, Pid} || {Id, Pid, _, _} <- supervisor:which_children(Sup), is_pid(Pid)]; + (_) -> + [] + end, + Shards + ), + maps:from_list(L). + %%================================================================================ %% behaviour callbacks %%================================================================================ diff --git a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_egress.erl b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_egress.erl index dc76aecf0..1d0efca6f 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_egress.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_egress.erl @@ -129,12 +129,20 @@ init([DB, Shard]) -> }, {ok, S}. -format_status(#s{db = DB, shard = Shard, queue = Q}) -> - #{ - db => DB, - shard => Shard, - queue => queue:len(Q) - }. +format_status(Status) -> + maps:map( + fun + (state, #s{db = DB, shard = Shard, queue = Q}) -> + #{ + db => DB, + shard => Shard, + queue => queue:len(Q) + }; + (_, Val) -> + Val + end, + Status + ). handle_call( #enqueue_req{ diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl index 71bf6fa6e..7d1fffbcb 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl @@ -589,23 +589,16 @@ init({ShardId, Options}) -> commit_metadata(S), {ok, S}. -format_status(#s{shard_id = ShardId, db = DB, cf_refs = CFRefs, schema = Schema, shard = Shard}) -> - #{ - id => ShardId, - db => DB, - cf_refs => CFRefs, - schema => Schema, - shard => - maps:map( - fun - (?GEN_KEY(_), _Schema) -> - '...'; - (_K, Val) -> - Val - end, - Shard - ) - }. +format_status(Status) -> + maps:map( + fun + (state, State) -> + format_state(State); + (_, Val) -> + Val + end, + Status + ). handle_call(#call_update_config{since = Since, options = Options}, _From, S0) -> case handle_update_config(S0, Since, Options) of @@ -791,7 +784,7 @@ handle_drop_generation(S0, GenId) -> EC => Err, stacktrace => Stack, generation => GenId, - s => format_status(S0) + s => format_state(S0) } ) end, @@ -994,6 +987,24 @@ generations_since(Shard, Since) -> Schema ). +format_state(#s{shard_id = ShardId, db = DB, cf_refs = CFRefs, schema = Schema, shard = Shard}) -> + #{ + id => ShardId, + db => DB, + cf_refs => CFRefs, + schema => Schema, + shard => + maps:map( + fun + (?GEN_KEY(_), _Schema) -> + '...'; + (_K, Val) -> + Val + end, + Shard + ) + }. + -define(PERSISTENT_TERM(SHARD), {emqx_ds_storage_layer, SHARD}). -spec get_schema_runtime(shard_id()) -> shard(). From 29345aaa30e7e5f931b12abd55165c2d1ca6489e Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Mon, 20 May 2024 15:53:11 +0200 Subject: [PATCH 36/55] fix(ds): Fix idle event generation in bitfield_lts layout --- apps/emqx/src/emqx_ds_schema.erl | 7 +------ .../src/emqx_ds_storage_bitfield_lts.erl | 8 +++++--- .../test/emqx_ds_replication_SUITE.erl | 2 +- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/apps/emqx/src/emqx_ds_schema.erl b/apps/emqx/src/emqx_ds_schema.erl index 87f87aa7f..dc395b291 100644 --- a/apps/emqx/src/emqx_ds_schema.erl +++ b/apps/emqx/src/emqx_ds_schema.erl @@ -246,7 +246,7 @@ fields(layout_builtin_reference) -> reference, #{ 'readOnly' => true, - importance => ?IMPORTANCE_HIDDEN + importance => ?IMPORTANCE_LOW } )} ]. @@ -273,17 +273,12 @@ ds_schema(Options) -> Options ). --ifndef(TEST). -builtin_layouts() -> - [ref(layout_builtin_wildcard_optimized)]. --else. builtin_layouts() -> %% Reference layout stores everything in one stream, so it's not %% suitable for production use. However, it's very simple and %% produces a very predictabale replay order, which can be useful %% for testing and debugging: [ref(layout_builtin_wildcard_optimized), ref(layout_builtin_reference)]. --endif. sc(Type, Meta) -> hoconsc:mk(Type, Meta). diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl index 8acb6e529..7d058109e 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl @@ -161,7 +161,7 @@ %% GVar used for idle detection: -define(IDLE_DETECT, idle_detect). --define(EPOCH(S, TS), (TS bsl S#s.ts_bits)). +-define(EPOCH(S, TS), (TS bsr S#s.ts_offset)). -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). @@ -458,7 +458,7 @@ next( SafeCutoffTime = case HasCutoff of true -> - (Now bsr TSOffset) bsl TSOffset; + ?EPOCH(Schema, Now) bsl TSOffset; false -> 1 bsl TSBits - 1 end, @@ -561,13 +561,15 @@ handle_event(_ShardId, State = #s{gvars = Gvars}, Time, tick) -> LastWrittenTs = 0 end, case Latch of - false when ?EPOCH(State, Time) > ?EPOCH(State, LastWrittenTs) -> + false when ?EPOCH(State, Time) > ?EPOCH(State, LastWrittenTs) + 1 -> ets:insert(Gvars, {?IDLE_DETECT, true, LastWrittenTs}), [dummy_event]; _ -> [] end; handle_event(_ShardId, _Data, _Time, _Event) -> + %% `dummy_event' goes here and does nothing. But it forces update + %% of `Time' in the replication layer. []. %%================================================================================ diff --git a/apps/emqx_durable_storage/test/emqx_ds_replication_SUITE.erl b/apps/emqx_durable_storage/test/emqx_ds_replication_SUITE.erl index eacf7e301..941e20641 100644 --- a/apps/emqx_durable_storage/test/emqx_ds_replication_SUITE.erl +++ b/apps/emqx_durable_storage/test/emqx_ds_replication_SUITE.erl @@ -183,7 +183,7 @@ t_rebalance(Config) -> ], Stream1 = emqx_utils_stream:interleave( [ - {50, Stream0}, + {10, Stream0}, emqx_utils_stream:const(add_generation) ], false From 5c78ecba40abd90e737889ebaf8da8e4016539d2 Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Tue, 21 May 2024 19:07:11 +0200 Subject: [PATCH 37/55] docs(ds): Update documentation for the storage layouts --- apps/emqx/src/emqx_ds_schema.erl | 5 ++++- rel/i18n/emqx_ds_schema.hocon | 17 +++++++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/apps/emqx/src/emqx_ds_schema.erl b/apps/emqx/src/emqx_ds_schema.erl index dc395b291..5902bcfb7 100644 --- a/apps/emqx/src/emqx_ds_schema.erl +++ b/apps/emqx/src/emqx_ds_schema.erl @@ -246,7 +246,8 @@ fields(layout_builtin_reference) -> reference, #{ 'readOnly' => true, - importance => ?IMPORTANCE_LOW + importance => ?IMPORTANCE_LOW, + desc => ?DESC(layout_builtin_reference_type) } )} ]. @@ -257,6 +258,8 @@ desc(builtin_local_write_buffer) -> ?DESC(builtin_local_write_buffer); desc(layout_builtin_wildcard_optimized) -> ?DESC(layout_builtin_wildcard_optimized); +desc(layout_builtin_reference) -> + ?DESC(layout_builtin_reference); desc(_) -> undefined. diff --git a/rel/i18n/emqx_ds_schema.hocon b/rel/i18n/emqx_ds_schema.hocon index 11d25ebe4..65b76b6fa 100644 --- a/rel/i18n/emqx_ds_schema.hocon +++ b/rel/i18n/emqx_ds_schema.hocon @@ -90,11 +90,20 @@ wildcard_optimized_epoch_bits.desc: Time span covered by each epoch grows exponentially with the value of `epoch_bits`: - - `epoch_bits = 1`: epoch time = 1 millisecond - - `epoch_bits = 2`: 2 milliseconds + - `epoch_bits = 1`: epoch time = 2 microseconds + - `epoch_bits = 2`: 4 microseconds ... - - `epoch_bits = 10`: 1024 milliseconds - - `epoch_bits = 13`: ~8 seconds + - `epoch_bits = 20`: ~1s ...~""" +layout_builtin_reference.label: "Reference layout" +layout_builtin_reference.desc: + """~ + A simplistic layout type that stores all messages from all topics in chronological order in a single stream. + + Not recommended for production use.~""" + +layout_builtin_reference_type.label: "Layout type" +layout_builtin_reference_type.desc: "Reference layout type." + } From 59a09fb86fa4fcadd624a461f2bae2e3df04065a Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Tue, 21 May 2024 22:07:03 +0200 Subject: [PATCH 38/55] fix(ds): Apply review remarks --- .../src/emqx_ds_replication_layer_shard.erl | 6 +++--- .../src/emqx_ds_storage_bitfield_lts.erl | 13 +++++++++++++ changes/ce/fix-13072.en.md | 10 ++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) create mode 100644 changes/ce/fix-13072.en.md diff --git a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl index 518e1e630..1070fbde0 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl @@ -115,11 +115,11 @@ get_servers_local_preferred(DB, Shard) -> Servers when is_list(Servers) -> ok end, - case lists:keyfind(node(), 2, Servers) of + case lists:keytake(node(), 2, Servers) of false -> Servers; - Local when is_tuple(Local) -> - [Local | lists:delete(Local, Servers)] + {value, Local, Rest} -> + [Local | Rest] end. lookup_leader(DB, Shard) -> diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl index 7d058109e..3b62fbfdf 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_bitfield_lts.erl @@ -553,6 +553,17 @@ delete_next_until( end. handle_event(_ShardId, State = #s{gvars = Gvars}, Time, tick) -> + %% If the last message was published more than one epoch ago, and + %% the shard remains idle, we need to advance safety cutoff + %% interval to make sure the last epoch becomes visible to the + %% readers. + %% + %% We do so by emitting a dummy event that will be persisted by + %% the replication layer. Processing it will advance the + %% replication layer's clock. + %% + %% This operation is latched to avoid publishing events on every + %% tick. case ets:lookup(Gvars, ?IDLE_DETECT) of [{?IDLE_DETECT, Latch, LastWrittenTs}] -> ok; @@ -562,6 +573,8 @@ handle_event(_ShardId, State = #s{gvars = Gvars}, Time, tick) -> end, case Latch of false when ?EPOCH(State, Time) > ?EPOCH(State, LastWrittenTs) + 1 -> + %% Note: + 1 above delays the event by one epoch to add a + %% safety margin. ets:insert(Gvars, {?IDLE_DETECT, true, LastWrittenTs}), [dummy_event]; _ -> diff --git a/changes/ce/fix-13072.en.md b/changes/ce/fix-13072.en.md new file mode 100644 index 000000000..da4a4253f --- /dev/null +++ b/changes/ce/fix-13072.en.md @@ -0,0 +1,10 @@ +Various fixes related to the `durable_sessions` feature: + +- Add an option to execute read operations on the leader. +- `drop_generation` operation can be replayed multiple times by the replication layer, but it's not idempotent. This PR adds a workaround that avoids a crash when `drop_generation` doesn't succeed. In the future, however, we want to make `drop_generation` idempotent in a nicer way. +- Wrap storage layer events in a small structure containing the generation ID, to make sure events are handled by the same layout CBM & context that produced them. +- Fix crash when storage event arrives to the dropped generation (now removed `storage_layer:generation_at` function didn't handle the case of dropped generations). +- Implement `format_status` callback for several workers to minimize log spam +- Move the responsibility of `end_of_stream` detection to the layout CBM. Previously storage layer used a heuristic: old generations that return an empty batch won't produce more data. This was, obviously, incorrect: for example, bitfield-LTS layout MAY return empty batch while waiting for safe cutoff time. +- `reference` layout has been enabled in prod build. It could be useful for integration testing. +- Fix incorrect epoch calculation in `bitfield_lts:handle_event` callback that lead to missed safe cutoff time updates, and effectively, subscribers being unable to fetch messages until a fresh batch was published. From 88c96e26de41199f4fb66f69be2960a04511098f Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Wed, 22 May 2024 18:03:43 +0200 Subject: [PATCH 39/55] refactor: simplify the code with maps:map/2 Thanks @thalesmg for the suggestion --- apps/emqx_resource/src/emqx_resource_manager.erl | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/apps/emqx_resource/src/emqx_resource_manager.erl b/apps/emqx_resource/src/emqx_resource_manager.erl index 7b9e13b16..d650a2afb 100644 --- a/apps/emqx_resource/src/emqx_resource_manager.erl +++ b/apps/emqx_resource/src/emqx_resource_manager.erl @@ -1499,14 +1499,11 @@ maybe_reply(Actions, From, Reply) -> -spec data_record_to_external_map(data()) -> resource_data(). data_record_to_external_map(Data) -> - AddedChannelsList = maps:to_list(Data#data.added_channels), - AddedChannelsListWithoutConfigs = - [ - {ChanID, maps:remove(config, Status)} - || {ChanID, Status} <- AddedChannelsList - ], AddedChannelsWithoutConfigs = - maps:from_list(AddedChannelsListWithoutConfigs), + maps:map( + fun(_ChanID, Status) -> maps:remove(config, Status) end, + Data#data.added_channels + ), #{ id => Data#data.id, error => external_error(Data#data.error), From 40940326494a05ff052276fe7026a064cfa458fd Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Wed, 22 May 2024 10:14:55 -0300 Subject: [PATCH 40/55] fix(bridge v2 api): don't attempt to start disabled connector when starting action/source Fixes https://emqx.atlassian.net/browse/EMQX-12435 --- apps/emqx_bridge/src/emqx_bridge_v2_api.erl | 43 ++++++++++++- .../test/emqx_bridge_v2_api_SUITE.erl | 61 +++++++++++++++++-- .../test/emqx_bridge_v2_testlib.erl | 43 +++++++++++++ .../test/emqx_bridge_v2_pgsql_SUITE.erl | 4 ++ changes/ce/fix-13090.en.md | 1 + 5 files changed, 144 insertions(+), 8 deletions(-) create mode 100644 changes/ce/fix-13090.en.md diff --git a/apps/emqx_bridge/src/emqx_bridge_v2_api.erl b/apps/emqx_bridge/src/emqx_bridge_v2_api.erl index e33e1ca07..56b2cb4ed 100644 --- a/apps/emqx_bridge/src/emqx_bridge_v2_api.erl +++ b/apps/emqx_bridge/src/emqx_bridge_v2_api.erl @@ -987,14 +987,45 @@ call_operation_if_enabled(NodeOrAll, OperFunc, [Nodes, ConfRootKey, BridgeType, ?BRIDGE_NOT_FOUND(BridgeType, BridgeName) end. -is_enabled_bridge(ConfRootKey, BridgeType, BridgeName) -> - try emqx_bridge_v2:lookup(ConfRootKey, BridgeType, binary_to_existing_atom(BridgeName)) of +is_enabled_bridge(ConfRootKey, ActionOrSourceType, BridgeName) -> + try + emqx_bridge_v2:lookup(ConfRootKey, ActionOrSourceType, binary_to_existing_atom(BridgeName)) + of {ok, #{raw_config := ConfMap}} -> - maps:get(<<"enable">>, ConfMap, true); + maps:get(<<"enable">>, ConfMap, true) andalso + is_connector_enabled( + ActionOrSourceType, + maps:get(<<"connector">>, ConfMap) + ); {error, not_found} -> throw(not_found) catch error:badarg -> + %% catch non-existing atom, + %% none-existing atom means it is not available in config PT storage. + throw(not_found); + error:{badkey, _} -> + %% `connector' field not present. Should never happen if action/source schema + %% is properly defined. + throw(not_found) + end. + +is_connector_enabled(ActionOrSourceType, ConnectorName0) -> + try + ConnectorType = emqx_bridge_v2:connector_type(ActionOrSourceType), + ConnectorName = to_existing_atom(ConnectorName0), + case emqx_config:get([connectors, ConnectorType, ConnectorName], undefined) of + undefined -> + throw(not_found); + Config = #{} -> + maps:get(enable, Config, true) + end + catch + throw:badarg -> + %% catch non-existing atom, + %% none-existing atom means it is not available in config PT storage. + throw(not_found); + throw:bad_atom -> %% catch non-existing atom, %% none-existing atom means it is not available in config PT storage. throw(not_found) @@ -1407,3 +1438,9 @@ map_to_json(M0) -> M2 = maps:without([value, <<"value">>], M1), emqx_utils_json:encode(M2) end. + +to_existing_atom(X) -> + case emqx_utils:safe_to_existing_atom(X, utf8) of + {ok, A} -> A; + {error, _} -> throw(bad_atom) + end. diff --git a/apps/emqx_bridge/test/emqx_bridge_v2_api_SUITE.erl b/apps/emqx_bridge/test/emqx_bridge_v2_api_SUITE.erl index b1e1ac38d..039402738 100644 --- a/apps/emqx_bridge/test/emqx_bridge_v2_api_SUITE.erl +++ b/apps/emqx_bridge/test/emqx_bridge_v2_api_SUITE.erl @@ -109,6 +109,7 @@ -define(SOURCE_TYPE_STR, "mqtt"). -define(SOURCE_TYPE, <>). +-define(SOURCE_CONNECTOR_TYPE, ?SOURCE_TYPE). -define(APPSPECS, [ emqx_conf, @@ -166,9 +167,19 @@ init_per_group(single = Group, Config) -> Apps = emqx_cth_suite:start(?APPSPECS ++ [?APPSPEC_DASHBOARD], #{work_dir => WorkDir}), init_api([{group, single}, {group_apps, Apps}, {node, node()} | Config]); init_per_group(actions, Config) -> - [{bridge_kind, action} | Config]; + [ + {bridge_kind, action}, + {connector_type, ?ACTION_CONNECTOR_TYPE}, + {connector_name, ?ACTION_CONNECTOR_NAME} + | Config + ]; init_per_group(sources, Config) -> - [{bridge_kind, source} | Config]; + [ + {bridge_kind, source}, + {connector_type, ?SOURCE_CONNECTOR_TYPE}, + {connector_name, ?SOURCE_CONNECTOR_NAME} + | Config + ]; init_per_group(_Group, Config) -> Config. @@ -202,14 +213,45 @@ end_per_group(single, Config) -> end_per_group(_Group, _Config) -> ok. -init_per_testcase(t_action_types, Config) -> +init_per_testcase(TestCase, Config) when + TestCase =:= t_start_action_or_source_with_disabled_connector; + TestCase =:= t_action_types +-> case ?config(cluster_nodes, Config) of undefined -> init_mocks(); Nodes -> [erpc:call(Node, ?MODULE, init_mocks, []) || Node <- Nodes] end, - Config; + #{ + connector_config := ConnectorConfig, + bridge_type := BridgeType, + bridge_name := BridgeName, + bridge_config := BridgeConfig + } = + case ?config(bridge_kind, Config) of + action -> + #{ + connector_config => ?ACTIONS_CONNECTOR, + bridge_type => {action_type, ?ACTION_TYPE}, + bridge_name => {action_name, ?ACTION_CONNECTOR_NAME}, + bridge_config => {action_config, ?KAFKA_BRIDGE(?ACTION_CONNECTOR_NAME)} + }; + source -> + #{ + connector_config => source_connector_create_config(#{}), + bridge_type => {source_type, ?SOURCE_TYPE}, + bridge_name => {source_name, ?SOURCE_CONNECTOR_NAME}, + bridge_config => {source_config, source_config_base()} + } + end, + [ + {connector_config, ConnectorConfig}, + BridgeType, + BridgeName, + BridgeConfig + | Config + ]; init_per_testcase(_TestCase, Config) -> case ?config(cluster_nodes, Config) of undefined -> @@ -434,7 +476,7 @@ source_connector_create_config(Overrides0) -> source_connector_config_base(), #{ <<"enable">> => true, - <<"type">> => ?SOURCE_TYPE, + <<"type">> => ?SOURCE_CONNECTOR_TYPE, <<"name">> => ?SOURCE_CONNECTOR_NAME } ), @@ -1547,3 +1589,12 @@ t_older_version_nodes_in_cluster(Config) -> ), ok. + +t_start_action_or_source_with_disabled_connector(matrix) -> + [ + [single, actions], + [single, sources] + ]; +t_start_action_or_source_with_disabled_connector(Config) -> + ok = emqx_bridge_v2_testlib:t_start_action_or_source_with_disabled_connector(Config), + ok. diff --git a/apps/emqx_bridge/test/emqx_bridge_v2_testlib.erl b/apps/emqx_bridge/test/emqx_bridge_v2_testlib.erl index b057a648e..82858f00b 100644 --- a/apps/emqx_bridge/test/emqx_bridge_v2_testlib.erl +++ b/apps/emqx_bridge/test/emqx_bridge_v2_testlib.erl @@ -383,6 +383,25 @@ start_connector_api(ConnectorName, ConnectorType) -> ct:pal("connector update (http) result:\n ~p", [Res]), Res. +enable_connector_api(ConnectorType, ConnectorName) -> + do_enable_disable_connector_api(ConnectorType, ConnectorName, enable). + +disable_connector_api(ConnectorType, ConnectorName) -> + do_enable_disable_connector_api(ConnectorType, ConnectorName, disable). + +do_enable_disable_connector_api(ConnectorType, ConnectorName, Op) -> + ConnectorId = emqx_connector_resource:connector_id(ConnectorType, ConnectorName), + {OpPath, OpStr} = + case Op of + enable -> {"true", "enable"}; + disable -> {"false", "disable"} + end, + Path = emqx_mgmt_api_test_util:api_path(["connectors", ConnectorId, "enable", OpPath]), + ct:pal(OpStr ++ " connector ~s (http)", [ConnectorId]), + Res = request(put, Path, []), + ct:pal(OpStr ++ " connector ~s (http) result:\n ~p", [ConnectorId, Res]), + Res. + get_connector_api(ConnectorType, ConnectorName) -> ConnectorId = emqx_connector_resource:connector_id(ConnectorType, ConnectorName), Path = emqx_mgmt_api_test_util:api_path(["connectors", ConnectorId]), @@ -956,3 +975,27 @@ t_on_get_status(Config, Opts) -> ) end, ok. + +%% Verifies that attempting to start an action while its connnector is disabled does not +%% start the connector. +t_start_action_or_source_with_disabled_connector(Config) -> + #{ + kind := Kind, + type := Type, + name := Name, + connector_type := ConnectorType, + connector_name := ConnectorName + } = get_common_values(Config), + ?check_trace( + begin + {ok, _} = create_bridge_api(Config), + {ok, {{_, 204, _}, _, _}} = disable_connector_api(ConnectorType, ConnectorName), + ?assertMatch( + {error, {{_, 400, _}, _, _}}, + op_bridge_api(Kind, "start", Type, Name) + ), + ok + end, + [] + ), + ok. diff --git a/apps/emqx_bridge_pgsql/test/emqx_bridge_v2_pgsql_SUITE.erl b/apps/emqx_bridge_pgsql/test/emqx_bridge_v2_pgsql_SUITE.erl index 561783760..9dff5ab22 100644 --- a/apps/emqx_bridge_pgsql/test/emqx_bridge_v2_pgsql_SUITE.erl +++ b/apps/emqx_bridge_pgsql/test/emqx_bridge_v2_pgsql_SUITE.erl @@ -228,3 +228,7 @@ t_sync_query(Config) -> postgres_bridge_connector_on_query_return ), ok. + +t_start_action_or_source_with_disabled_connector(Config) -> + ok = emqx_bridge_v2_testlib:t_start_action_or_source_with_disabled_connector(Config), + ok. diff --git a/changes/ce/fix-13090.en.md b/changes/ce/fix-13090.en.md new file mode 100644 index 000000000..ee3593036 --- /dev/null +++ b/changes/ce/fix-13090.en.md @@ -0,0 +1 @@ +Attempting to start an action or source whose connector is disabled will no longer attempt to start the connector itself. From 927b1337fa928257ff6e9b6b9736edaf8c783a9f Mon Sep 17 00:00:00 2001 From: zmstone Date: Wed, 22 May 2024 12:03:52 +0200 Subject: [PATCH 41/55] chore: update dashboard version --- Makefile | 2 +- build | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 5b458d239..b9ff0dd53 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,7 @@ endif # Dashboard version # from https://github.com/emqx/emqx-dashboard5 export EMQX_DASHBOARD_VERSION ?= v1.9.0-beta.1 -export EMQX_EE_DASHBOARD_VERSION ?= e1.7.0-beta.4 +export EMQX_EE_DASHBOARD_VERSION ?= e1.7.0-beta.9 PROFILE ?= emqx REL_PROFILES := emqx emqx-enterprise diff --git a/build b/build index 54f5e4db2..bf5c348f9 100755 --- a/build +++ b/build @@ -145,7 +145,7 @@ make_docs() { scripts/merge-i18n.escript | jq --sort-keys . > "$desc" else # it is not a big deal if we cannot generate the desc - log_red "NOT Generated: $desc" + log_red "NOT Generated: $desc due to jq command missing." fi } From 5cad4497de0ff6f3bfa98a490c41eb7221aeacf5 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Wed, 22 May 2024 13:49:23 -0300 Subject: [PATCH 42/55] fix(monitor api): add ds connections and subscriptions to old counters Fixes https://emqx.atlassian.net/browse/EMQX-12423 Fixes https://emqx.atlassian.net/browse/EMQX-12267 --- .../src/emqx_dashboard_monitor.erl | 27 ++++++++++++++-- .../src/emqx_dashboard_monitor_api.erl | 17 +++++++++- .../test/emqx_dashboard_monitor_SUITE.erl | 32 +++++++++++++------ 3 files changed, 63 insertions(+), 13 deletions(-) diff --git a/apps/emqx_dashboard/src/emqx_dashboard_monitor.erl b/apps/emqx_dashboard/src/emqx_dashboard_monitor.erl index d3b533ebf..8d6a1dbc9 100644 --- a/apps/emqx_dashboard/src/emqx_dashboard_monitor.erl +++ b/apps/emqx_dashboard/src/emqx_dashboard_monitor.erl @@ -264,8 +264,8 @@ merge_cluster_rate(Node, Cluster) -> %% cluster-synced values (disconnected_durable_sessions, V, NCluster) -> NCluster#{disconnected_durable_sessions => V}; - (durable_subscriptions, V, NCluster) -> - NCluster#{durable_subscriptions => V}; + (subscriptions_durable, V, NCluster) -> + NCluster#{subscriptions_durable => V}; (topics, V, NCluster) -> NCluster#{topics => V}; (retained_msg_count, V, NCluster) -> @@ -281,7 +281,28 @@ merge_cluster_rate(Node, Cluster) -> ClusterValue = maps:get(Key, NCluster, 0), NCluster#{Key => Value + ClusterValue} end, - maps:fold(Fun, Cluster, Node). + Metrics = maps:fold(Fun, Cluster, Node), + adjust_synthetic_cluster_metrics(Metrics). + +adjust_synthetic_cluster_metrics(Metrics0) -> + %% ensure renamed + Metrics1 = emqx_utils_maps:rename(durable_subscriptions, subscriptions_durable, Metrics0), + DSSubs = maps:get(subscriptions_durable, Metrics1, 0), + RamSubs = maps:get(subscriptions, Metrics1, 0), + DisconnectedDSs = maps:get(disconnected_durable_sessions, Metrics1, 0), + Metrics2 = maps:update_with( + subscriptions, + fun(Subs) -> Subs + DSSubs end, + 0, + Metrics1 + ), + Metrics = maps:put(subscriptions_ram, RamSubs, Metrics2), + maps:update_with( + connections, + fun(RamConns) -> RamConns + DisconnectedDSs end, + DisconnectedDSs, + Metrics + ). format({badrpc, Reason}) -> {badrpc, Reason}; diff --git a/apps/emqx_dashboard/src/emqx_dashboard_monitor_api.erl b/apps/emqx_dashboard/src/emqx_dashboard_monitor_api.erl index c8bb9c8be..3d9536299 100644 --- a/apps/emqx_dashboard/src/emqx_dashboard_monitor_api.erl +++ b/apps/emqx_dashboard/src/emqx_dashboard_monitor_api.erl @@ -19,6 +19,7 @@ -include("emqx_dashboard.hrl"). -include_lib("typerefl/include/types.hrl"). -include_lib("hocon/include/hocon_types.hrl"). +-include_lib("emqx_utils/include/emqx_utils_api.hrl"). -behaviour(minirest_api). @@ -159,7 +160,12 @@ dashboard_samplers_fun(Latest) -> monitor_current(get, #{bindings := Bindings}) -> RawNode = maps:get(node, Bindings, <<"all">>), - emqx_utils_api:with_node_or_cluster(RawNode, fun current_rate/1). + case emqx_utils_api:with_node_or_cluster(RawNode, fun current_rate/1) of + ?OK(Rates) -> + ?OK(maybe_reject_cluster_only_metrics(RawNode, Rates)); + Error -> + Error + end. -spec current_rate(atom()) -> {error, term()} @@ -242,3 +248,12 @@ swagger_desc_format(Format) -> swagger_desc_format(Format, Type) -> Interval = emqx_conf:get([dashboard, monitor, interval], ?DEFAULT_SAMPLE_INTERVAL), list_to_binary(io_lib:format(Format ++ "~p ~p seconds", [Type, Interval])). + +maybe_reject_cluster_only_metrics(<<"all">>, Rates) -> + Rates; +maybe_reject_cluster_only_metrics(_Node, Rates) -> + ClusterOnlyMetrics = [ + durable_subscriptions, + disconnected_durable_sessions + ], + maps:without(ClusterOnlyMetrics, Rates). diff --git a/apps/emqx_dashboard/test/emqx_dashboard_monitor_SUITE.erl b/apps/emqx_dashboard/test/emqx_dashboard_monitor_SUITE.erl index 59951faa9..8286810db 100644 --- a/apps/emqx_dashboard/test/emqx_dashboard_monitor_SUITE.erl +++ b/apps/emqx_dashboard/test/emqx_dashboard_monitor_SUITE.erl @@ -196,13 +196,22 @@ t_monitor_current_api(_) -> {ok, Rate} = request(["monitor_current"]), [ ?assert(maps:is_key(atom_to_binary(Key, utf8), Rate)) - || Key <- maps:values(?DELTA_SAMPLER_RATE_MAP) ++ ?GAUGE_SAMPLER_LIST + || Key <- maps:values(?DELTA_SAMPLER_RATE_MAP) ++ ?GAUGE_SAMPLER_LIST, + %% We rename `durable_subscriptions' key. + Key =/= durable_subscriptions ], + ?assert(maps:is_key(<<"subscriptions_durable">>, Rate)), + ?assert(maps:is_key(<<"disconnected_durable_sessions">>, Rate)), + ClusterOnlyMetrics = [durable_subscriptions, disconnected_durable_sessions], {ok, NodeRate} = request(["monitor_current", "nodes", node()]), [ - ?assert(maps:is_key(atom_to_binary(Key, utf8), NodeRate)) - || Key <- maps:values(?DELTA_SAMPLER_RATE_MAP) ++ ?GAUGE_SAMPLER_LIST + ?assert(maps:is_key(atom_to_binary(Key, utf8), NodeRate), #{key => Key, rates => NodeRate}) + || Key <- maps:values(?DELTA_SAMPLER_RATE_MAP) ++ ?GAUGE_SAMPLER_LIST, + not lists:member(Key, ClusterOnlyMetrics) ], + ?assertNot(maps:is_key(<<"subscriptions_durable">>, NodeRate)), + ?assertNot(maps:is_key(<<"subscriptions_ram">>, NodeRate)), + ?assertNot(maps:is_key(<<"disconnected_durable_sessions">>, NodeRate)), ok. t_monitor_current_api_live_connections(_) -> @@ -290,8 +299,11 @@ t_monitor_reset(_) -> {ok, Rate} = request(["monitor_current"]), [ ?assert(maps:is_key(atom_to_binary(Key, utf8), Rate)) - || Key <- maps:values(?DELTA_SAMPLER_RATE_MAP) ++ ?GAUGE_SAMPLER_LIST + || Key <- maps:values(?DELTA_SAMPLER_RATE_MAP) ++ ?GAUGE_SAMPLER_LIST, + %% We rename `durable_subscriptions' key. + Key =/= durable_subscriptions ], + ?assert(maps:is_key(<<"subscriptions_durable">>, Rate)), {ok, _} = snabbkaffe:block_until( ?match_n_events(1, #{?snk_kind := dashboard_monitor_flushed}), @@ -347,8 +359,9 @@ t_persistent_session_stats(_Config) -> %% and non-persistent routes, so we count `commont/topic' twice and get 8 %% instead of 6 here. <<"topics">> := 8, - <<"durable_subscriptions">> := 4, - <<"subscriptions">> := 4 + <<"subscriptions">> := 8, + <<"subscriptions_ram">> := 4, + <<"subscriptions_durable">> := 4 }}, request(["monitor_current"]) ) @@ -368,14 +381,15 @@ t_persistent_session_stats(_Config) -> ?retry(1_000, 10, begin ?assertMatch( {ok, #{ - <<"connections">> := 1, + <<"connections">> := 2, <<"disconnected_durable_sessions">> := 1, %% N.B.: we currently don't perform any deduplication between persistent %% and non-persistent routes, so we count `commont/topic' twice and get 8 %% instead of 6 here. <<"topics">> := 8, - <<"durable_subscriptions">> := 4, - <<"subscriptions">> := 4 + <<"subscriptions">> := 8, + <<"subscriptions_ram">> := 4, + <<"subscriptions_durable">> := 4 }}, request(["monitor_current"]) ) From 6eb04f90a3c4077a9a60a0c1fdbd03ecac7fbf7b Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Wed, 22 May 2024 20:28:16 +0200 Subject: [PATCH 43/55] fix(ds): Allow to write batches to older generations --- .../src/emqx_ds_storage_layer.erl | 53 +++++++++++++------ 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl index 7d1fffbcb..2245e81c5 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl @@ -301,26 +301,30 @@ store_batch(Shard, Messages, Options) -> [{emqx_ds:time(), emqx_types:message()}], emqx_ds:message_store_opts() ) -> {ok, cooked_batch()} | ignore | emqx_ds:error(_). -prepare_batch(Shard, Messages = [_ | _], Options) -> +prepare_batch(Shard, Messages = [{Time, _} | _], Options) -> %% NOTE %% We assume that batches do not span generations. Callers should enforce this. ?tp(emqx_ds_storage_layer_prepare_batch, #{ shard => Shard, messages => Messages, options => Options }), - GenId = generation_current(Shard), - #{module := Mod, data := GenData} = generation_get(Shard, GenId), - T0 = erlang:monotonic_time(microsecond), - Result = - case Mod:prepare_batch(Shard, GenData, Messages, Options) of - {ok, CookedBatch} -> - {ok, #{?tag => ?COOKED_BATCH, ?generation => GenId, ?enc => CookedBatch}}; - Error = {error, _, _} -> - Error - end, - T1 = erlang:monotonic_time(microsecond), - %% TODO store->prepare - emqx_ds_builtin_metrics:observe_store_batch_time(Shard, T1 - T0), - Result; + %% FIXME: always store messages in the current generation + case generation_at(Shard, Time) of + {GenId, #{module := Mod, data := GenData}} -> + T0 = erlang:monotonic_time(microsecond), + Result = + case Mod:prepare_batch(Shard, GenData, Messages, Options) of + {ok, CookedBatch} -> + {ok, #{?tag => ?COOKED_BATCH, ?generation => GenId, ?enc => CookedBatch}}; + Error = {error, _, _} -> + Error + end, + T1 = erlang:monotonic_time(microsecond), + %% TODO store->prepare + emqx_ds_builtin_metrics:observe_store_batch_time(Shard, T1 - T0), + Result; + not_found -> + ignore + end; prepare_batch(_Shard, [], _Options) -> ignore. @@ -964,6 +968,25 @@ generation_current(Shard) -> #{current_generation := Current} = get_schema_runtime(Shard), Current. +%% TODO: remove me +-spec generation_at(shard_id(), emqx_ds:time()) -> {gen_id(), generation()} | not_found. +generation_at(Shard, Time) -> + Schema = #{current_generation := Current} = get_schema_runtime(Shard), + generation_at(Time, Current, Schema). + +generation_at(Time, GenId, Schema) -> + case Schema of + #{?GEN_KEY(GenId) := Gen} -> + case Gen of + #{since := Since} when Time < Since andalso GenId > 0 -> + generation_at(Time, GenId - 1, Schema); + _ -> + {GenId, Gen} + end; + _ -> + not_found + end. + -spec generation_get(shard_id(), gen_id()) -> generation() | not_found. generation_get(Shard, GenId) -> case get_schema_runtime(Shard) of From 07774ab060795c921cdb3797073349a340755fef Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Tue, 21 May 2024 21:52:19 +0200 Subject: [PATCH 44/55] feat(sessds): Handle unrecoverable errors --- apps/emqx/src/emqx_persistent_session_ds.erl | 48 +++++++++++++++++--- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/apps/emqx/src/emqx_persistent_session_ds.erl b/apps/emqx/src/emqx_persistent_session_ds.erl index 923e17aaf..4bc6b4183 100644 --- a/apps/emqx/src/emqx_persistent_session_ds.erl +++ b/apps/emqx/src/emqx_persistent_session_ds.erl @@ -629,8 +629,10 @@ replay_streams(Session0 = #{replay := [{StreamKey, Srs0} | Rest]}, ClientInfo) - class => recoverable, retry_in_ms => RetryTimeout }), - emqx_session:ensure_timer(?TIMER_RETRY_REPLAY, RetryTimeout, Session0) - %% TODO: Handle unrecoverable errors. + emqx_session:ensure_timer(?TIMER_RETRY_REPLAY, RetryTimeout, Session0); + {error, unrecoverable, Reason} -> + Session1 = skip_batch(StreamKey, Srs0, Session0, ClientInfo, Reason), + replay_streams(Session1#{replay := Rest}, ClientInfo) end; replay_streams(Session0 = #{replay := []}, _ClientInfo) -> Session = maps:remove(replay, Session0), @@ -655,6 +657,39 @@ replay_batch(Srs0, Session0, ClientInfo) -> Error end. +%% Handle `{error, unrecoverable, _}' returned by `enqueue_batch'. +%% Most likely they mean that the generation containing the messages +%% has been removed. +-spec skip_batch(_StreamKey, stream_state(), session(), clientinfo(), _Reason) -> session(). +skip_batch(StreamKey, SRS0, Session = #{s := S0}, ClientInfo, Reason) -> + ?SLOG(info, #{ + msg => "session_ds_replay_unrecoverable_error", + reason => Reason, + srs => SRS0 + }), + GenEvents = fun + F(QoS, SeqNo, LastSeqNo) when SeqNo < LastSeqNo -> + FakeMsg = #message{ + id = <<>>, + qos = QoS, + payload = <<>>, + topic = <<>>, + timestamp = 0 + }, + _ = emqx_session_events:handle_event(ClientInfo, {expired, FakeMsg}), + F(QoS, inc_seqno(QoS, SeqNo), LastSeqNo); + F(_, _, _) -> + ok + end, + %% Treat messages as expired: + GenEvents(?QOS_1, SRS0#srs.first_seqno_qos1, SRS0#srs.last_seqno_qos1), + GenEvents(?QOS_2, SRS0#srs.first_seqno_qos2, SRS0#srs.last_seqno_qos2), + SRS = SRS0#srs{it_end = end_of_stream, batch_size = 0}, + %% That's it for the iterator. Mark SRS as reached the + %% `end_of_stream', and let stream scheduler do the rest: + S = emqx_persistent_session_ds_state:put_stream(StreamKey, SRS, S0), + Session#{s := S}. + %%-------------------------------------------------------------------- -spec disconnect(session(), emqx_types:conninfo()) -> {shutdown, session()}. @@ -923,15 +958,16 @@ new_batch({StreamKey, Srs0}, BatchSize, Session0 = #{s := S0}, ClientInfo) -> ), S = emqx_persistent_session_ds_state:put_stream(StreamKey, Srs, S2), Session#{s => S}; - {error, Class, Reason} -> - %% TODO: Handle unrecoverable error. + {error, recoverable, Reason} -> ?SLOG(debug, #{ msg => "failed_to_fetch_batch", stream => StreamKey, reason => Reason, - class => Class + class => recoverable }), - Session0 + Session0; + {error, unrecoverable, Reason} -> + skip_batch(StreamKey, Srs1, Session0, ClientInfo, Reason) end. enqueue_batch(IsReplay, BatchSize, Srs0, Session = #{inflight := Inflight0, s := S}, ClientInfo) -> From e10c87b825ace3b9778e3f577ac5563152181001 Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Thu, 23 May 2024 10:25:27 +0200 Subject: [PATCH 45/55] fix: add action rendered trace for s3 in aggregated mode Fixes: https://emqx.atlassian.net/browse/EMQX-12429 --- .../src/emqx_bridge_s3_connector.erl | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/apps/emqx_bridge_s3/src/emqx_bridge_s3_connector.erl b/apps/emqx_bridge_s3/src/emqx_bridge_s3_connector.erl index 88b161033..dd0362e5d 100644 --- a/apps/emqx_bridge_s3/src/emqx_bridge_s3_connector.erl +++ b/apps/emqx_bridge_s3/src/emqx_bridge_s3_connector.erl @@ -310,7 +310,7 @@ on_query(InstId, {Tag, Data}, #{client_config := Config, channels := Channels}) ChannelState = #{mode := direct} -> run_simple_upload(InstId, Tag, Data, ChannelState, Config); ChannelState = #{mode := aggregated} -> - run_aggregated_upload(InstId, [Data], ChannelState); + run_aggregated_upload(InstId, Tag, [Data], ChannelState); undefined -> {error, {unrecoverable_error, {invalid_message_tag, Tag}}} end. @@ -321,7 +321,7 @@ on_batch_query(InstId, [{Tag, Data0} | Rest], #{channels := Channels}) -> case maps:get(Tag, Channels, undefined) of ChannelState = #{mode := aggregated} -> Records = [Data0 | [Data || {_, Data} <- Rest]], - run_aggregated_upload(InstId, Records, ChannelState); + run_aggregated_upload(InstId, Tag, Records, ChannelState); undefined -> {error, {unrecoverable_error, {invalid_message_tag, Tag}}} end. @@ -362,8 +362,15 @@ run_simple_upload( {error, map_error(Reason)} end. -run_aggregated_upload(InstId, Records, #{aggreg_id := AggregId}) -> +run_aggregated_upload(InstId, ChannelID, Records, #{aggreg_id := AggregId}) -> Timestamp = erlang:system_time(second), + emqx_trace:rendered_action_template(ChannelID, #{ + mode => aggregated, + records => #emqx_trace_format_func_data{ + function = fun render_records/1, + data = Records + } + }), case emqx_connector_aggregator:push_records(AggregId, Timestamp, Records) of ok -> ?tp(s3_bridge_aggreg_push_ok, #{instance_id => InstId, name => AggregId}), @@ -372,6 +379,13 @@ run_aggregated_upload(InstId, Records, #{aggreg_id := AggregId}) -> {error, {unrecoverable_error, Reason}} end. +render_records(Records) -> + try + [unicode:characters_to_binary(R) || R <- Records] + catch + _:_ -> Records + end. + map_error({socket_error, _} = Reason) -> {recoverable_error, Reason}; map_error(Reason = {aws_error, Status, _, _Body}) when Status >= 500 -> From fb7688ab94dcb04411441199a5c759e1d21be2b2 Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Thu, 23 May 2024 12:30:56 +0200 Subject: [PATCH 46/55] fix(trace): make sure that the payload encode works with nested payloads This commit makes sure that the trace setting for payload encode works even when the payload is in a nested structure or when the payload key is a binary instead of an atom. Fixes: https://emqx.atlassian.net/browse/EMQX-12424 --- .../src/emqx_trace/emqx_trace_formatter.erl | 20 ++++++++++-- .../emqx_trace/emqx_trace_json_formatter.erl | 22 +++++++++---- .../emqx_rule_engine_api_rule_apply_SUITE.erl | 32 ++++++++++++------- 3 files changed, 54 insertions(+), 20 deletions(-) diff --git a/apps/emqx/src/emqx_trace/emqx_trace_formatter.erl b/apps/emqx/src/emqx_trace/emqx_trace_formatter.erl index c1213459c..34469f835 100644 --- a/apps/emqx/src/emqx_trace/emqx_trace_formatter.erl +++ b/apps/emqx/src/emqx_trace/emqx_trace_formatter.erl @@ -48,7 +48,11 @@ format_meta_map(Meta) -> format_meta_map(Meta, Encode). format_meta_map(Meta, Encode) -> - format_meta_map(Meta, Encode, [{packet, fun format_packet/2}, {payload, fun format_payload/2}]). + format_meta_map(Meta, Encode, [ + {packet, fun format_packet/2}, + {payload, fun format_payload/2}, + {<<"payload">>, fun format_payload/2} + ]). format_meta_map(Meta, _Encode, []) -> Meta; @@ -61,9 +65,21 @@ format_meta_map(Meta, Encode, [{Name, FormatFun} | Rest]) -> format_meta_map(Meta, Encode, Rest) end. +format_meta_data(Meta0, Encode) when is_map(Meta0) -> + Meta1 = format_meta_map(Meta0, Encode), + maps:map(fun(_K, V) -> format_meta_data(V, Encode) end, Meta1); +format_meta_data(Meta, Encode) when is_list(Meta) -> + [format_meta_data(Item, Encode) || Item <- Meta]; +format_meta_data(Meta, Encode) when is_tuple(Meta) -> + List = erlang:tuple_to_list(Meta), + FormattedList = [format_meta_data(Item, Encode) || Item <- List], + erlang:list_to_tuple(FormattedList); +format_meta_data(Meta, _Encode) -> + Meta. + format_meta(Meta0, Encode) -> Meta1 = maps:without([msg, clientid, peername, trace_tag], Meta0), - Meta2 = format_meta_map(Meta1, Encode), + Meta2 = format_meta_data(Meta1, Encode), kvs_to_iolist(lists:sort(fun compare_meta_kvs/2, maps:to_list(Meta2))). %% packet always goes first; payload always goes last diff --git a/apps/emqx/src/emqx_trace/emqx_trace_json_formatter.erl b/apps/emqx/src/emqx_trace/emqx_trace_json_formatter.erl index 85b846349..e06473650 100644 --- a/apps/emqx/src/emqx_trace/emqx_trace_json_formatter.erl +++ b/apps/emqx/src/emqx_trace/emqx_trace_json_formatter.erl @@ -42,7 +42,7 @@ format( %% an external call to create the JSON text Time = emqx_utils_calendar:now_to_rfc3339(microsecond), LogMap2 = LogMap1#{time => Time}, - LogMap3 = prepare_log_map(LogMap2, PEncode), + LogMap3 = prepare_log_data(LogMap2, PEncode), [emqx_logger_jsonfmt:best_effort_json(LogMap3, [force_utf8]), "\n"]. %%%----------------------------------------------------------------- @@ -85,9 +85,17 @@ do_maybe_format_msg({report, Report} = Msg, #{report_cb := Cb} = Meta, Config) - do_maybe_format_msg(Msg, Meta, Config) -> emqx_logger_jsonfmt:format_msg(Msg, Meta, Config). -prepare_log_map(LogMap, PEncode) -> +prepare_log_data(LogMap, PEncode) when is_map(LogMap) -> NewKeyValuePairs = [prepare_key_value(K, V, PEncode) || {K, V} <- maps:to_list(LogMap)], - maps:from_list(NewKeyValuePairs). + maps:from_list(NewKeyValuePairs); +prepare_log_data(V, PEncode) when is_list(V) -> + [prepare_log_data(Item, PEncode) || Item <- V]; +prepare_log_data(V, PEncode) when is_tuple(V) -> + List = erlang:tuple_to_list(V), + PreparedList = [prepare_log_data(Item, PEncode) || Item <- List], + erlang:list_to_tuple(PreparedList); +prepare_log_data(V, _PEncode) -> + V. prepare_key_value(host, {I1, I2, I3, I4} = IP, _PEncode) when is_integer(I1), @@ -118,6 +126,8 @@ prepare_key_value(payload = K, V, PEncode) -> V end, {K, NewV}; +prepare_key_value(<<"payload">>, V, PEncode) -> + prepare_key_value(payload, V, PEncode); prepare_key_value(packet = K, V, PEncode) -> NewV = try @@ -167,10 +177,8 @@ prepare_key_value(action_id = K, V, _PEncode) -> _:_ -> {K, V} end; -prepare_key_value(K, V, PEncode) when is_map(V) -> - {K, prepare_log_map(V, PEncode)}; -prepare_key_value(K, V, _PEncode) -> - {K, V}. +prepare_key_value(K, V, PEncode) -> + {K, prepare_log_data(V, PEncode)}. format_packet(undefined, _) -> ""; format_packet(Packet, Encode) -> emqx_packet:format(Packet, Encode). diff --git a/apps/emqx_rule_engine/test/emqx_rule_engine_api_rule_apply_SUITE.erl b/apps/emqx_rule_engine/test/emqx_rule_engine_api_rule_apply_SUITE.erl index a7a464842..0f0395013 100644 --- a/apps/emqx_rule_engine/test/emqx_rule_engine_api_rule_apply_SUITE.erl +++ b/apps/emqx_rule_engine/test/emqx_rule_engine_api_rule_apply_SUITE.erl @@ -91,13 +91,16 @@ end_per_testcase(_TestCase, _Config) -> ok. t_basic_apply_rule_trace_ruleid(Config) -> - basic_apply_rule_test_helper(get_action(Config), ruleid, false). + basic_apply_rule_test_helper(get_action(Config), ruleid, false, text). + +t_basic_apply_rule_trace_ruleid_hidden_payload(Config) -> + basic_apply_rule_test_helper(get_action(Config), ruleid, false, hidden). t_basic_apply_rule_trace_clientid(Config) -> - basic_apply_rule_test_helper(get_action(Config), clientid, false). + basic_apply_rule_test_helper(get_action(Config), clientid, false, text). t_basic_apply_rule_trace_ruleid_stop_after_render(Config) -> - basic_apply_rule_test_helper(get_action(Config), ruleid, true). + basic_apply_rule_test_helper(get_action(Config), ruleid, true, text). get_action(Config) -> case ?config(group_name, Config) of @@ -135,10 +138,10 @@ republish_action() -> console_print_action() -> #{<<"function">> => <<"console">>}. -basic_apply_rule_test_helper(Action, TraceType, StopAfterRender) -> +basic_apply_rule_test_helper(Action, TraceType, StopAfterRender, PayloadEncode) -> %% Create Rule RuleTopic = iolist_to_binary([<<"my_rule_topic/">>, atom_to_binary(?FUNCTION_NAME)]), - SQL = <<"SELECT payload.id as id FROM \"", RuleTopic/binary, "\"">>, + SQL = <<"SELECT payload.id as id, payload as payload FROM \"", RuleTopic/binary, "\"">>, {ok, #{<<"id">> := RuleId}} = emqx_bridge_testlib:create_rule_and_action( Action, @@ -157,12 +160,12 @@ basic_apply_rule_test_helper(Action, TraceType, StopAfterRender) -> clientid -> ClientId end, - create_trace(TraceName, TraceType, TraceValue), + create_trace(TraceName, TraceType, TraceValue, PayloadEncode), %% =================================== Context = #{ clientid => ClientId, event_type => message_publish, - payload => <<"{\"msg\": \"hello\"}">>, + payload => <<"{\"msg\": \"my_payload_msg\"}">>, qos => 1, topic => RuleTopic, username => <<"u_emqx">> @@ -179,6 +182,12 @@ basic_apply_rule_test_helper(Action, TraceType, StopAfterRender) -> begin Bin = read_rule_trace_file(TraceName, TraceType, Now), io:format("THELOG:~n~s", [Bin]), + case PayloadEncode of + hidden -> + ?assertEqual(nomatch, binary:match(Bin, [<<"my_payload_msg">>])); + text -> + ?assertNotEqual(nomatch, binary:match(Bin, [<<"my_payload_msg">>])) + end, ?assertNotEqual(nomatch, binary:match(Bin, [<<"rule_activated">>])), ?assertNotEqual(nomatch, binary:match(Bin, [<<"SQL_yielded_result">>])), case Action of @@ -273,7 +282,7 @@ do_final_log_check(Action, Bin0) when is_binary(Action) -> do_final_log_check(_, _) -> ok. -create_trace(TraceName, TraceType, TraceValue) -> +create_trace(TraceName, TraceType, TraceValue, PayloadEncode) -> Now = erlang:system_time(second) - 10, Start = Now, End = Now + 60, @@ -283,7 +292,8 @@ create_trace(TraceName, TraceType, TraceValue) -> TraceType => TraceValue, start_at => Start, end_at => End, - formatter => json + formatter => json, + payload_encode => PayloadEncode }, {ok, _} = CreateRes = emqx_trace:create(Trace), emqx_common_test_helpers:on_exit(fun() -> @@ -323,7 +333,7 @@ t_apply_rule_test_batch_separation_stop_after_render(_Config) -> ?FUNCTION_NAME, SQL ), - create_trace(Name, ruleid, RuleID), + create_trace(Name, ruleid, RuleID, text), Now = erlang:system_time(second) - 10, %% Stop ParmsStopAfterRender = apply_rule_parms(true, Name), @@ -588,7 +598,7 @@ do_apply_rule_test_format_action_failed_test(BatchSize, CheckLastTraceEntryFun) ?FUNCTION_NAME, SQL ), - create_trace(Name, ruleid, RuleID), + create_trace(Name, ruleid, RuleID, text), Now = erlang:system_time(second) - 10, %% Stop ParmsNoStopAfterRender = apply_rule_parms(false, Name), From 89b47e8ffccbeff5b9149f3fc2ab9866d4e611fa Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Thu, 23 May 2024 12:41:30 +0200 Subject: [PATCH 47/55] fix(s3 tracing): do not format records as IO data in aggregate mode --- apps/emqx_bridge_s3/src/emqx_bridge_s3_connector.erl | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/apps/emqx_bridge_s3/src/emqx_bridge_s3_connector.erl b/apps/emqx_bridge_s3/src/emqx_bridge_s3_connector.erl index dd0362e5d..bc9f37935 100644 --- a/apps/emqx_bridge_s3/src/emqx_bridge_s3_connector.erl +++ b/apps/emqx_bridge_s3/src/emqx_bridge_s3_connector.erl @@ -366,10 +366,7 @@ run_aggregated_upload(InstId, ChannelID, Records, #{aggreg_id := AggregId}) -> Timestamp = erlang:system_time(second), emqx_trace:rendered_action_template(ChannelID, #{ mode => aggregated, - records => #emqx_trace_format_func_data{ - function = fun render_records/1, - data = Records - } + records => Records }), case emqx_connector_aggregator:push_records(AggregId, Timestamp, Records) of ok -> @@ -379,13 +376,6 @@ run_aggregated_upload(InstId, ChannelID, Records, #{aggreg_id := AggregId}) -> {error, {unrecoverable_error, Reason}} end. -render_records(Records) -> - try - [unicode:characters_to_binary(R) || R <- Records] - catch - _:_ -> Records - end. - map_error({socket_error, _} = Reason) -> {recoverable_error, Reason}; map_error(Reason = {aws_error, Status, _, _Body}) when Status >= 500 -> From 065411c69c87297ce30ea3eb67803f1f511f382e Mon Sep 17 00:00:00 2001 From: JimMoen Date: Thu, 23 May 2024 20:04:38 +0800 Subject: [PATCH 48/55] fix: add ds metrics into prometheus zipper init acc --- apps/emqx_prometheus/src/emqx_prometheus.erl | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/apps/emqx_prometheus/src/emqx_prometheus.erl b/apps/emqx_prometheus/src/emqx_prometheus.erl index 667af1a30..82e60d4bb 100644 --- a/apps/emqx_prometheus/src/emqx_prometheus.erl +++ b/apps/emqx_prometheus/src/emqx_prometheus.erl @@ -294,7 +294,7 @@ fetch_cluster_consistented_data() -> }. aggre_or_zip_init_acc() -> - #{ + (maybe_add_ds_meta())#{ stats_data => maps:from_keys(metrics_name(stats_metric_meta()), []), vm_data => maps:from_keys(metrics_name(vm_metric_meta()), []), cluster_data => maps:from_keys(metrics_name(cluster_metric_meta()), []), @@ -656,6 +656,20 @@ emqx_metric_data(MetricNameTypeKeyL, Mode) -> MetricNameTypeKeyL ). +%%========== +%% Durable Storage +maybe_add_ds_meta() -> + case emqx_persistent_message:is_persistence_enabled() of + true -> + #{ + ds_data => maps:from_keys( + metrics_name(emqx_ds_builtin_metrics:prometheus_meta()), [] + ) + }; + false -> + #{} + end. + %%========== %% Bytes && Packets emqx_packet_metric_meta() -> From b91ecac16887f617e0fe0cb70cc8616d982b304d Mon Sep 17 00:00:00 2001 From: JimMoen Date: Thu, 23 May 2024 20:09:14 +0800 Subject: [PATCH 49/55] refactor(prom): unify function name to avoid multiple levels call --- apps/emqx_prometheus/src/emqx_prometheus.erl | 31 ++++++++++---------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/apps/emqx_prometheus/src/emqx_prometheus.erl b/apps/emqx_prometheus/src/emqx_prometheus.erl index 82e60d4bb..f4d0ff2c0 100644 --- a/apps/emqx_prometheus/src/emqx_prometheus.erl +++ b/apps/emqx_prometheus/src/emqx_prometheus.erl @@ -295,18 +295,18 @@ fetch_cluster_consistented_data() -> aggre_or_zip_init_acc() -> (maybe_add_ds_meta())#{ - stats_data => maps:from_keys(metrics_name(stats_metric_meta()), []), - vm_data => maps:from_keys(metrics_name(vm_metric_meta()), []), - cluster_data => maps:from_keys(metrics_name(cluster_metric_meta()), []), - emqx_packet_data => maps:from_keys(metrics_name(emqx_packet_metric_meta()), []), - emqx_message_data => maps:from_keys(metrics_name(message_metric_meta()), []), - emqx_delivery_data => maps:from_keys(metrics_name(delivery_metric_meta()), []), - emqx_client_data => maps:from_keys(metrics_name(client_metric_meta()), []), - emqx_session_data => maps:from_keys(metrics_name(session_metric_meta()), []), - emqx_olp_data => maps:from_keys(metrics_name(olp_metric_meta()), []), - emqx_acl_data => maps:from_keys(metrics_name(acl_metric_meta()), []), - emqx_authn_data => maps:from_keys(metrics_name(authn_metric_meta()), []), - mria_data => maps:from_keys(metrics_name(mria_metric_meta()), []) + stats_data => meta_to_init_from(stats_metric_meta()), + vm_data => meta_to_init_from(vm_metric_meta()), + cluster_data => meta_to_init_from(cluster_metric_meta()), + emqx_packet_data => meta_to_init_from(emqx_packet_metric_meta()), + emqx_message_data => meta_to_init_from(message_metric_meta()), + emqx_delivery_data => meta_to_init_from(delivery_metric_meta()), + emqx_client_data => meta_to_init_from(client_metric_meta()), + emqx_session_data => meta_to_init_from(session_metric_meta()), + emqx_olp_data => meta_to_init_from(olp_metric_meta()), + emqx_acl_data => meta_to_init_from(acl_metric_meta()), + emqx_authn_data => meta_to_init_from(authn_metric_meta()), + mria_data => meta_to_init_from(mria_metric_meta()) }. logic_sum_metrics() -> @@ -662,9 +662,7 @@ maybe_add_ds_meta() -> case emqx_persistent_message:is_persistence_enabled() of true -> #{ - ds_data => maps:from_keys( - metrics_name(emqx_ds_builtin_metrics:prometheus_meta()), [] - ) + ds_data => meta_to_init_from(emqx_ds_builtin_metrics:prometheus_meta()) }; false -> #{} @@ -1130,6 +1128,9 @@ zip_json_prom_stats_metrics(Key, Points, AllResultedAcc) -> ThisKeyResult = lists:foldl(emqx_prometheus_cluster:point_to_map_fun(Key), [], Points), lists:zipwith(fun maps:merge/2, AllResultedAcc, ThisKeyResult). +meta_to_init_from(Meta) -> + maps:from_keys(metrics_name(Meta), []). + metrics_name(MetricsAll) -> [Name || {Name, _, _} <- MetricsAll]. From bf326acd7bfcd4d154e06062e8d312b529a54efc Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Thu, 23 May 2024 14:34:27 +0200 Subject: [PATCH 50/55] fix(dsrepl): handle stopping non-yet-running shard supervisor --- .../src/emqx_ds_builtin_db_sup.erl | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/apps/emqx_durable_storage/src/emqx_ds_builtin_db_sup.erl b/apps/emqx_durable_storage/src/emqx_ds_builtin_db_sup.erl index b2a461e7a..747f8241f 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_builtin_db_sup.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_builtin_db_sup.erl @@ -79,11 +79,15 @@ start_shard({DB, Shard}) -> start_egress({DB, Shard}) -> supervisor:start_child(?via(#?egress_sup{db = DB}), egress_spec(DB, Shard)). --spec stop_shard(emqx_ds_storage_layer:shard_id()) -> ok. +-spec stop_shard(emqx_ds_storage_layer:shard_id()) -> ok | {error, not_found}. stop_shard({DB, Shard}) -> Sup = ?via(#?shards_sup{db = DB}), - ok = supervisor:terminate_child(Sup, Shard), - ok = supervisor:delete_child(Sup, Shard). + case supervisor:terminate_child(Sup, Shard) of + ok -> + supervisor:delete_child(Sup, Shard); + {error, Reason} -> + {error, Reason} + end. -spec terminate_storage(emqx_ds_storage_layer:shard_id()) -> ok | {error, _Reason}. terminate_storage({DB, Shard}) -> From c0e3a81c6129d9320a1c478008798b96783b84be Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Thu, 23 May 2024 09:31:00 -0300 Subject: [PATCH 51/55] fix(monitor api): fix cluster metric aggregation Fixes https://emqx.atlassian.net/browse/EMQX-12423 Fixes https://emqx.atlassian.net/browse/EMQX-12267 --- apps/emqx/test/emqx_cth_cluster.erl | 1 + .../src/emqx_dashboard_monitor.erl | 27 ++-- .../src/emqx_dashboard_monitor_api.erl | 2 +- .../test/emqx_dashboard_monitor_SUITE.erl | 115 ++++++++++++++---- 4 files changed, 105 insertions(+), 40 deletions(-) diff --git a/apps/emqx/test/emqx_cth_cluster.erl b/apps/emqx/test/emqx_cth_cluster.erl index 28ee1f30f..f3c5d97f9 100644 --- a/apps/emqx/test/emqx_cth_cluster.erl +++ b/apps/emqx/test/emqx_cth_cluster.erl @@ -109,6 +109,7 @@ start(Nodes, ClusterOpts) -> start(NodeSpecs). start(NodeSpecs) -> + emqx_common_test_helpers:clear_screen(), ct:pal("(Re)starting nodes:\n ~p", [NodeSpecs]), % 1. Start bare nodes with only basic applications running ok = start_nodes_init(NodeSpecs, ?TIMEOUT_NODE_START_MS), diff --git a/apps/emqx_dashboard/src/emqx_dashboard_monitor.erl b/apps/emqx_dashboard/src/emqx_dashboard_monitor.erl index 8d6a1dbc9..a11d875f3 100644 --- a/apps/emqx_dashboard/src/emqx_dashboard_monitor.erl +++ b/apps/emqx_dashboard/src/emqx_dashboard_monitor.erl @@ -119,7 +119,7 @@ current_rate(all) -> current_rate(Node) when Node == node() -> try {ok, Rate} = do_call(current_rate), - {ok, Rate} + {ok, adjust_individual_node_metrics(Rate)} catch _E:R -> ?SLOG(warning, #{msg => "dashboard_monitor_error", reason => R}), @@ -156,8 +156,8 @@ current_rate_cluster() -> case lists:foldl(Fun, #{}, mria:cluster_nodes(running)) of {badrpc, Reason} -> {badrpc, Reason}; - Rate -> - {ok, Rate} + Metrics -> + {ok, adjust_synthetic_cluster_metrics(Metrics)} end. %% ------------------------------------------------------------------------------------------------- @@ -281,22 +281,23 @@ merge_cluster_rate(Node, Cluster) -> ClusterValue = maps:get(Key, NCluster, 0), NCluster#{Key => Value + ClusterValue} end, - Metrics = maps:fold(Fun, Cluster, Node), - adjust_synthetic_cluster_metrics(Metrics). + maps:fold(Fun, Cluster, Node). + +adjust_individual_node_metrics(Metrics0) -> + %% ensure renamed + emqx_utils_maps:rename(durable_subscriptions, subscriptions_durable, Metrics0). adjust_synthetic_cluster_metrics(Metrics0) -> - %% ensure renamed - Metrics1 = emqx_utils_maps:rename(durable_subscriptions, subscriptions_durable, Metrics0), - DSSubs = maps:get(subscriptions_durable, Metrics1, 0), - RamSubs = maps:get(subscriptions, Metrics1, 0), - DisconnectedDSs = maps:get(disconnected_durable_sessions, Metrics1, 0), - Metrics2 = maps:update_with( + DSSubs = maps:get(subscriptions_durable, Metrics0, 0), + RamSubs = maps:get(subscriptions, Metrics0, 0), + DisconnectedDSs = maps:get(disconnected_durable_sessions, Metrics0, 0), + Metrics1 = maps:update_with( subscriptions, fun(Subs) -> Subs + DSSubs end, 0, - Metrics1 + Metrics0 ), - Metrics = maps:put(subscriptions_ram, RamSubs, Metrics2), + Metrics = maps:put(subscriptions_ram, RamSubs, Metrics1), maps:update_with( connections, fun(RamConns) -> RamConns + DisconnectedDSs end, diff --git a/apps/emqx_dashboard/src/emqx_dashboard_monitor_api.erl b/apps/emqx_dashboard/src/emqx_dashboard_monitor_api.erl index 3d9536299..2560f8f2a 100644 --- a/apps/emqx_dashboard/src/emqx_dashboard_monitor_api.erl +++ b/apps/emqx_dashboard/src/emqx_dashboard_monitor_api.erl @@ -253,7 +253,7 @@ maybe_reject_cluster_only_metrics(<<"all">>, Rates) -> Rates; maybe_reject_cluster_only_metrics(_Node, Rates) -> ClusterOnlyMetrics = [ - durable_subscriptions, + subscriptions_durable, disconnected_durable_sessions ], maps:without(ClusterOnlyMetrics, Rates). diff --git a/apps/emqx_dashboard/test/emqx_dashboard_monitor_SUITE.erl b/apps/emqx_dashboard/test/emqx_dashboard_monitor_SUITE.erl index 8286810db..edf7c9bbf 100644 --- a/apps/emqx_dashboard/test/emqx_dashboard_monitor_SUITE.erl +++ b/apps/emqx_dashboard/test/emqx_dashboard_monitor_SUITE.erl @@ -49,6 +49,8 @@ "}" >>). +-define(ON(NODE, BODY), erpc:call(NODE, fun() -> BODY end)). + %%-------------------------------------------------------------------- %% CT boilerplate %%-------------------------------------------------------------------- @@ -79,21 +81,37 @@ end_per_suite(_Config) -> ok. init_per_group(persistent_sessions = Group, Config) -> - Apps = emqx_cth_suite:start( + AppSpecsFn = fun(Enable) -> + Port = + case Enable of + true -> "18083"; + false -> "0" + end, [ emqx_conf, {emqx, "durable_sessions {enable = true}"}, {emqx_retainer, ?BASE_RETAINER_CONF}, emqx_management, emqx_mgmt_api_test_util:emqx_dashboard( - "dashboard.listeners.http { enable = true, bind = 18083 }\n" - "dashboard.sample_interval = 1s" + lists:concat([ + "dashboard.listeners.http { bind = " ++ Port ++ " }\n", + "dashboard.sample_interval = 1s\n", + "dashboard.listeners.http.enable = " ++ atom_to_list(Enable) + ]) ) - ], - #{work_dir => emqx_cth_suite:work_dir(Group, Config)} - ), - {ok, _} = emqx_common_test_http:create_default_app(), - [{apps, Apps} | Config]; + ] + end, + NodeSpecs = [ + {dashboard_monitor1, #{apps => AppSpecsFn(true)}}, + {dashboard_monitor2, #{apps => AppSpecsFn(false)}} + ], + Nodes = + [N1 | _] = emqx_cth_cluster:start( + NodeSpecs, + #{work_dir => emqx_cth_suite:work_dir(Group, Config)} + ), + ?ON(N1, {ok, _} = emqx_common_test_http:create_default_app()), + [{cluster, Nodes} | Config]; init_per_group(common = Group, Config) -> Apps = emqx_cth_suite:start( [ @@ -111,7 +129,11 @@ init_per_group(common = Group, Config) -> {ok, _} = emqx_common_test_http:create_default_app(), [{apps, Apps} | Config]. -end_per_group(_Group, Config) -> +end_per_group(persistent_sessions, Config) -> + Cluster = ?config(cluster, Config), + emqx_cth_cluster:stop(Cluster), + ok; +end_per_group(common, Config) -> Apps = ?config(apps, Config), emqx_cth_suite:stop(Apps), ok. @@ -325,26 +347,36 @@ t_monitor_api_error(_) -> ok. %% Verifies that subscriptions from persistent sessions are correctly accounted for. -t_persistent_session_stats(_Config) -> +t_persistent_session_stats(Config) -> + [N1, N2 | _] = ?config(cluster, Config), %% pre-condition - true = emqx_persistent_message:is_persistence_enabled(), + true = ?ON(N1, emqx_persistent_message:is_persistence_enabled()), + Port1 = get_mqtt_port(N1, tcp), + Port2 = get_mqtt_port(N2, tcp), NonPSClient = start_and_connect(#{ + port => Port1, clientid => <<"non-ps">>, expiry_interval => 0 }), - PSClient = start_and_connect(#{ - clientid => <<"ps">>, + PSClient1 = start_and_connect(#{ + port => Port1, + clientid => <<"ps1">>, + expiry_interval => 30 + }), + PSClient2 = start_and_connect(#{ + port => Port2, + clientid => <<"ps2">>, expiry_interval => 30 }), {ok, _, [?RC_GRANTED_QOS_2]} = emqtt:subscribe(NonPSClient, <<"non/ps/topic/+">>, 2), {ok, _, [?RC_GRANTED_QOS_2]} = emqtt:subscribe(NonPSClient, <<"non/ps/topic">>, 2), {ok, _, [?RC_GRANTED_QOS_2]} = emqtt:subscribe(NonPSClient, <<"common/topic/+">>, 2), {ok, _, [?RC_GRANTED_QOS_2]} = emqtt:subscribe(NonPSClient, <<"common/topic">>, 2), - {ok, _, [?RC_GRANTED_QOS_2]} = emqtt:subscribe(PSClient, <<"ps/topic/+">>, 2), - {ok, _, [?RC_GRANTED_QOS_2]} = emqtt:subscribe(PSClient, <<"ps/topic">>, 2), - {ok, _, [?RC_GRANTED_QOS_2]} = emqtt:subscribe(PSClient, <<"common/topic/+">>, 2), - {ok, _, [?RC_GRANTED_QOS_2]} = emqtt:subscribe(PSClient, <<"common/topic">>, 2), + {ok, _, [?RC_GRANTED_QOS_2]} = emqtt:subscribe(PSClient1, <<"ps/topic/+">>, 2), + {ok, _, [?RC_GRANTED_QOS_2]} = emqtt:subscribe(PSClient1, <<"ps/topic">>, 2), + {ok, _, [?RC_GRANTED_QOS_2]} = emqtt:subscribe(PSClient1, <<"common/topic/+">>, 2), + {ok, _, [?RC_GRANTED_QOS_2]} = emqtt:subscribe(PSClient1, <<"common/topic">>, 2), {ok, _} = snabbkaffe:block_until( ?match_n_events(2, #{?snk_kind := dashboard_monitor_flushed}), @@ -353,7 +385,7 @@ t_persistent_session_stats(_Config) -> ?retry(1_000, 10, begin ?assertMatch( {ok, #{ - <<"connections">> := 2, + <<"connections">> := 3, <<"disconnected_durable_sessions">> := 0, %% N.B.: we currently don't perform any deduplication between persistent %% and non-persistent routes, so we count `commont/topic' twice and get 8 @@ -363,25 +395,25 @@ t_persistent_session_stats(_Config) -> <<"subscriptions_ram">> := 4, <<"subscriptions_durable">> := 4 }}, - request(["monitor_current"]) + ?ON(N1, request(["monitor_current"])) ) end), %% Sanity checks - PSRouteCount = emqx_persistent_session_ds_router:stats(n_routes), + PSRouteCount = ?ON(N1, emqx_persistent_session_ds_router:stats(n_routes)), ?assert(PSRouteCount > 0, #{ps_route_count => PSRouteCount}), - PSSubCount = emqx_persistent_session_bookkeeper:get_subscription_count(), + PSSubCount = ?ON(N1, emqx_persistent_session_bookkeeper:get_subscription_count()), ?assert(PSSubCount > 0, #{ps_sub_count => PSSubCount}), %% Now with disconnected but alive persistent sessions {ok, {ok, _}} = ?wait_async_action( - emqtt:disconnect(PSClient), + emqtt:disconnect(PSClient1), #{?snk_kind := dashboard_monitor_flushed} ), ?retry(1_000, 10, begin ?assertMatch( {ok, #{ - <<"connections">> := 2, + <<"connections">> := 3, <<"disconnected_durable_sessions">> := 1, %% N.B.: we currently don't perform any deduplication between persistent %% and non-persistent routes, so we count `commont/topic' twice and get 8 @@ -391,7 +423,28 @@ t_persistent_session_stats(_Config) -> <<"subscriptions_ram">> := 4, <<"subscriptions_durable">> := 4 }}, - request(["monitor_current"]) + ?ON(N1, request(["monitor_current"])) + ) + end), + {ok, {ok, _}} = + ?wait_async_action( + emqtt:disconnect(PSClient2), + #{?snk_kind := dashboard_monitor_flushed} + ), + ?retry(1_000, 10, begin + ?assertMatch( + {ok, #{ + <<"connections">> := 3, + <<"disconnected_durable_sessions">> := 2, + %% N.B.: we currently don't perform any deduplication between persistent + %% and non-persistent routes, so we count `commont/topic' twice and get 8 + %% instead of 6 here. + <<"topics">> := 8, + <<"subscriptions">> := 8, + <<"subscriptions_ram">> := 4, + <<"subscriptions_durable">> := 4 + }}, + ?ON(N1, request(["monitor_current"])) ) end), @@ -467,15 +520,21 @@ waiting_emqx_stats_and_monitor_update(WaitKey) -> ok. start_and_connect(Opts) -> - Defaults = #{clean_start => false, expiry_interval => 30}, + Defaults = #{ + clean_start => false, + expiry_interval => 30, + port => 1883 + }, #{ clientid := ClientId, clean_start := CleanStart, - expiry_interval := EI + expiry_interval := EI, + port := Port } = maps:merge(Defaults, Opts), {ok, Client} = emqtt:start_link([ {clientid, ClientId}, {clean_start, CleanStart}, + {port, Port}, {proto_ver, v5}, {properties, #{'Session-Expiry-Interval' => EI}} ]), @@ -484,3 +543,7 @@ start_and_connect(Opts) -> end), {ok, _} = emqtt:connect(Client), Client. + +get_mqtt_port(Node, Type) -> + {_IP, Port} = ?ON(Node, emqx_config:get([listeners, Type, default, bind])), + Port. From 839b143fc464aeb8d6a4da3ae1cec372cdc332ba Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Thu, 23 May 2024 14:48:39 +0200 Subject: [PATCH 52/55] fix(sessds): consider message durability disabled by default --- apps/emqx/src/emqx_persistent_message.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/emqx/src/emqx_persistent_message.erl b/apps/emqx/src/emqx_persistent_message.erl index 8e67856d7..d4492f721 100644 --- a/apps/emqx/src/emqx_persistent_message.erl +++ b/apps/emqx/src/emqx_persistent_message.erl @@ -54,7 +54,7 @@ init() -> -spec is_persistence_enabled() -> boolean(). is_persistence_enabled() -> - persistent_term:get(?PERSISTENCE_ENABLED). + persistent_term:get(?PERSISTENCE_ENABLED, false). -spec is_persistence_enabled(emqx_types:zone()) -> boolean(). is_persistence_enabled(Zone) -> From ba6382adaed40af8c07d2fa857e673501d39572a Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Thu, 23 May 2024 14:54:01 +0200 Subject: [PATCH 53/55] fix(dsrepl): trigger "last-resort" pending transitions handler when idle This is a hack to work around the unintended issues causing shard allocator to become idle even when there are pending transitions. --- .../src/emqx_ds_replication_shard_allocator.erl | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/apps/emqx_durable_storage/src/emqx_ds_replication_shard_allocator.erl b/apps/emqx_durable_storage/src/emqx_ds_replication_shard_allocator.erl index d198b2ddd..cbaafc718 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_replication_shard_allocator.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_replication_shard_allocator.erl @@ -41,6 +41,7 @@ -define(shard_meta(DB, SHARD), {?MODULE, DB, SHARD}). -define(ALLOCATE_RETRY_TIMEOUT, 1_000). +-define(TRIGGER_PENDING_TIMEOUT, 60_000). -define(TRANS_RETRY_TIMEOUT, 5_000). -define(CRASH_RETRY_DELAY, 20_000). @@ -106,7 +107,7 @@ handle_call(_Call, _From, State) -> -spec handle_cast(_Cast, state()) -> {noreply, state()}. handle_cast(#trigger_transitions{}, State) -> - {noreply, handle_pending_transitions(State)}; + {noreply, handle_pending_transitions(State), ?TRIGGER_PENDING_TIMEOUT}; handle_cast(_Cast, State) -> {noreply, State}. @@ -118,13 +119,15 @@ handle_cast(_Cast, State) -> handle_info({timeout, _TRef, allocate}, State) -> {noreply, handle_allocate_shards(State)}; handle_info({changed, {shard, DB, Shard}}, State = #{db := DB}) -> - {noreply, handle_shard_changed(Shard, State)}; + {noreply, handle_shard_changed(Shard, State), ?TRIGGER_PENDING_TIMEOUT}; handle_info({changed, _}, State) -> - {noreply, State}; + {noreply, State, ?TRIGGER_PENDING_TIMEOUT}; handle_info({'EXIT', Pid, Reason}, State) -> - {noreply, handle_exit(Pid, Reason, State)}; + {noreply, handle_exit(Pid, Reason, State), ?TRIGGER_PENDING_TIMEOUT}; +handle_info(timeout, State) -> + {noreply, handle_pending_transitions(State), ?TRIGGER_PENDING_TIMEOUT}; handle_info(_Info, State) -> - {noreply, State}. + {noreply, State, ?TRIGGER_PENDING_TIMEOUT}. -spec terminate(_Reason, state()) -> _Ok. terminate(_Reason, State = #{db := DB, shards := Shards}) -> From 5584b658c90fe5137e9d6cd5864da33e6f7033b8 Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Thu, 23 May 2024 16:35:48 +0200 Subject: [PATCH 54/55] fix(sessds): Change the defaults to renew streams every 1 second --- apps/emqx/src/emqx_schema.erl | 2 +- apps/emqx/test/emqx_config_SUITE.erl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/emqx/src/emqx_schema.erl b/apps/emqx/src/emqx_schema.erl index c3584d7dd..948849ccb 100644 --- a/apps/emqx/src/emqx_schema.erl +++ b/apps/emqx/src/emqx_schema.erl @@ -1690,7 +1690,7 @@ fields("durable_sessions") -> sc( timeout_duration(), #{ - default => <<"5000ms">>, + default => <<"1s">>, importance => ?IMPORTANCE_HIDDEN } )}, diff --git a/apps/emqx/test/emqx_config_SUITE.erl b/apps/emqx/test/emqx_config_SUITE.erl index d32b88b07..28f542f81 100644 --- a/apps/emqx/test/emqx_config_SUITE.erl +++ b/apps/emqx/test/emqx_config_SUITE.erl @@ -473,7 +473,7 @@ zone_global_defaults() -> idle_poll_interval => 100, heartbeat_interval => 5000, message_retention_period => 86400000, - renew_streams_interval => 5000, + renew_streams_interval => 1000, session_gc_batch_size => 100, session_gc_interval => 600000, subscription_count_refresh_interval => 5000, From d47bf8076a170f4f3495b8f2d1d13c7b5aef883f Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Thu, 23 May 2024 17:53:13 +0200 Subject: [PATCH 55/55] test(mgmt): better isolate testsuite running environments --- .../test/emqx_mgmt_api_publish_SUITE.erl | 15 +++++++++++---- .../test/emqx_mgmt_api_stats_SUITE.erl | 1 - .../test/emqx_mgmt_api_status_SUITE.erl | 15 +++++++++++---- apps/emqx_management/test/emqx_mgmt_cli_SUITE.erl | 15 +++++++++++---- 4 files changed, 33 insertions(+), 13 deletions(-) diff --git a/apps/emqx_management/test/emqx_mgmt_api_publish_SUITE.erl b/apps/emqx_management/test/emqx_mgmt_api_publish_SUITE.erl index cb44a43f6..dc7085c42 100644 --- a/apps/emqx_management/test/emqx_mgmt_api_publish_SUITE.erl +++ b/apps/emqx_management/test/emqx_mgmt_api_publish_SUITE.erl @@ -29,11 +29,18 @@ all() -> emqx_common_test_helpers:all(?MODULE). init_per_suite(Config) -> - emqx_mgmt_api_test_util:init_suite(), - Config. + Apps = emqx_cth_suite:start( + [ + emqx, + emqx_management, + emqx_mgmt_api_test_util:emqx_dashboard() + ], + #{work_dir => emqx_cth_suite:work_dir(Config)} + ), + [{apps, Apps} | Config]. -end_per_suite(_) -> - emqx_mgmt_api_test_util:end_suite(). +end_per_suite(Config) -> + ok = emqx_cth_suite:stop(?config(apps, Config)). init_per_testcase(Case, Config) -> ?MODULE:Case({init, Config}). diff --git a/apps/emqx_management/test/emqx_mgmt_api_stats_SUITE.erl b/apps/emqx_management/test/emqx_mgmt_api_stats_SUITE.erl index 1db30a73b..05eaa830c 100644 --- a/apps/emqx_management/test/emqx_mgmt_api_stats_SUITE.erl +++ b/apps/emqx_management/test/emqx_mgmt_api_stats_SUITE.erl @@ -33,7 +33,6 @@ init_per_suite(Config) -> ], #{work_dir => emqx_cth_suite:work_dir(Config)} ), - {ok, _Api} = emqx_common_test_http:create_default_app(), [{apps, Apps} | Config]. end_per_suite(Config) -> diff --git a/apps/emqx_management/test/emqx_mgmt_api_status_SUITE.erl b/apps/emqx_management/test/emqx_mgmt_api_status_SUITE.erl index 36ee4830d..64d9b032f 100644 --- a/apps/emqx_management/test/emqx_mgmt_api_status_SUITE.erl +++ b/apps/emqx_management/test/emqx_mgmt_api_status_SUITE.erl @@ -51,11 +51,18 @@ groups() -> ]. init_per_suite(Config) -> - emqx_mgmt_api_test_util:init_suite(), - Config. + Apps = emqx_cth_suite:start( + [ + emqx, + emqx_management, + emqx_mgmt_api_test_util:emqx_dashboard() + ], + #{work_dir => emqx_cth_suite:work_dir(Config)} + ), + [{apps, Apps} | Config]. -end_per_suite(_) -> - emqx_mgmt_api_test_util:end_suite(). +end_per_suite(Config) -> + ok = emqx_cth_suite:stop(?config(apps, Config)). init_per_group(api_status_endpoint, Config) -> [{get_status_path, ["api", "v5", "status"]} | Config]; diff --git a/apps/emqx_management/test/emqx_mgmt_cli_SUITE.erl b/apps/emqx_management/test/emqx_mgmt_cli_SUITE.erl index b1d646b40..f896468b7 100644 --- a/apps/emqx_management/test/emqx_mgmt_cli_SUITE.erl +++ b/apps/emqx_management/test/emqx_mgmt_cli_SUITE.erl @@ -25,12 +25,19 @@ all() -> emqx_common_test_helpers:all(?MODULE). init_per_suite(Config) -> - emqx_mgmt_api_test_util:init_suite([emqx_conf, emqx_management]), + Apps = emqx_cth_suite:start( + [ + emqx_conf, + emqx_management, + emqx_mgmt_api_test_util:emqx_dashboard() + ], + #{work_dir => emqx_cth_suite:work_dir(Config)} + ), ok = emqx_mgmt_cli:load(), - Config. + [{apps, Apps} | Config]. -end_per_suite(_) -> - emqx_mgmt_api_test_util:end_suite([emqx_management, emqx_conf]). +end_per_suite(Config) -> + ok = emqx_cth_suite:stop(?config(apps, Config)). init_per_testcase(t_autocluster_leave = TC, Config) -> [Core1, Core2, Repl1, Repl2] =