From 115ab856706bf80382de295c28c9cc754ae4707c Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Fri, 4 Aug 2023 15:31:34 +0200 Subject: [PATCH 01/85] chore: upgrade Kafka client wolff to 1.7.7 --- apps/emqx_bridge_azure_event_hub/rebar.config | 2 +- apps/emqx_bridge_kafka/rebar.config | 2 +- mix.exs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/emqx_bridge_azure_event_hub/rebar.config b/apps/emqx_bridge_azure_event_hub/rebar.config index 85c39ce01..dbcc8269c 100644 --- a/apps/emqx_bridge_azure_event_hub/rebar.config +++ b/apps/emqx_bridge_azure_event_hub/rebar.config @@ -1,6 +1,6 @@ %% -*- mode: erlang; -*- {erl_opts, [debug_info]}. -{deps, [ {wolff, {git, "https://github.com/kafka4beam/wolff.git", {tag, "1.7.6"}}} +{deps, [ {wolff, {git, "https://github.com/kafka4beam/wolff.git", {tag, "1.7.7"}}} , {kafka_protocol, {git, "https://github.com/kafka4beam/kafka_protocol.git", {tag, "4.1.3"}}} , {brod_gssapi, {git, "https://github.com/kafka4beam/brod_gssapi.git", {tag, "v0.1.0"}}} , {brod, {git, "https://github.com/kafka4beam/brod.git", {tag, "3.16.8"}}} diff --git a/apps/emqx_bridge_kafka/rebar.config b/apps/emqx_bridge_kafka/rebar.config index 945ccbdba..8246fa8cf 100644 --- a/apps/emqx_bridge_kafka/rebar.config +++ b/apps/emqx_bridge_kafka/rebar.config @@ -1,6 +1,6 @@ %% -*- mode: erlang; -*- {erl_opts, [debug_info]}. 
-{deps, [ {wolff, {git, "https://github.com/kafka4beam/wolff.git", {tag, "1.7.6"}}} +{deps, [ {wolff, {git, "https://github.com/kafka4beam/wolff.git", {tag, "1.7.7"}}} , {kafka_protocol, {git, "https://github.com/kafka4beam/kafka_protocol.git", {tag, "4.1.3"}}} , {brod_gssapi, {git, "https://github.com/kafka4beam/brod_gssapi.git", {tag, "v0.1.0"}}} , {brod, {git, "https://github.com/kafka4beam/brod.git", {tag, "3.16.8"}}} diff --git a/mix.exs b/mix.exs index 00d190136..836d4649a 100644 --- a/mix.exs +++ b/mix.exs @@ -203,7 +203,7 @@ defmodule EMQXUmbrella.MixProject do [ {:hstreamdb_erl, github: "hstreamdb/hstreamdb_erl", tag: "0.3.1+v0.12.0"}, {:influxdb, github: "emqx/influxdb-client-erl", tag: "1.1.11", override: true}, - {:wolff, github: "kafka4beam/wolff", tag: "1.7.6"}, + {:wolff, github: "kafka4beam/wolff", tag: "1.7.7"}, {:kafka_protocol, github: "kafka4beam/kafka_protocol", tag: "4.1.3", override: true}, {:brod_gssapi, github: "kafka4beam/brod_gssapi", tag: "v0.1.0"}, {:brod, github: "kafka4beam/brod", tag: "3.16.8"}, From ff88e508a6fc04676711436a89a26b86754f1f0e Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Mon, 7 Aug 2023 10:39:04 +0200 Subject: [PATCH 02/85] docs: add changelog --- changes/ee/fix-11394.en.md | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 changes/ee/fix-11394.en.md diff --git a/changes/ee/fix-11394.en.md b/changes/ee/fix-11394.en.md new file mode 100644 index 000000000..ace678ecc --- /dev/null +++ b/changes/ee/fix-11394.en.md @@ -0,0 +1,2 @@ +Upgrade Kafka producer client `wolff` from 1.7.6 to 1.7.7. +This fixes a potential race condition which may cause all Kafka producers to crash if some failed to initialize. 
From 3ed031db706e1eb19be6b61a74194f188f8dc029 Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Mon, 7 Aug 2023 16:10:30 +0200 Subject: [PATCH 03/85] fix: rule SQL mongo_date function should return a string in test mode The rule SQL mongo_date function should return a string with the format ISODate(*), where * is an ISO date string when running the rule in test mode. Fixes: https://emqx.atlassian.net/browse/EMQX-10727 --- .../src/emqx_rule_engine.app.src | 2 +- apps/emqx_rule_engine/src/emqx_rule_funcs.erl | 19 +++++++++++++-- .../src/emqx_rule_sqltester.erl | 23 ++++++++++++++++++- .../emqx_rule_engine_api_rule_test_SUITE.erl | 21 +++++++++++++++++ 4 files changed, 61 insertions(+), 4 deletions(-) diff --git a/apps/emqx_rule_engine/src/emqx_rule_engine.app.src b/apps/emqx_rule_engine/src/emqx_rule_engine.app.src index 09d57a4f9..e6d00bcae 100644 --- a/apps/emqx_rule_engine/src/emqx_rule_engine.app.src +++ b/apps/emqx_rule_engine/src/emqx_rule_engine.app.src @@ -2,7 +2,7 @@ {application, emqx_rule_engine, [ {description, "EMQX Rule Engine"}, % strict semver, bump manually! - {vsn, "5.0.22"}, + {vsn, "5.0.23"}, {modules, []}, {registered, [emqx_rule_engine_sup, emqx_rule_engine]}, {applications, [kernel, stdlib, rulesql, getopt, emqx_ctl, uuid]}, diff --git a/apps/emqx_rule_engine/src/emqx_rule_funcs.erl b/apps/emqx_rule_engine/src/emqx_rule_funcs.erl index 64522ee60..0c55f92b4 100644 --- a/apps/emqx_rule_engine/src/emqx_rule_funcs.erl +++ b/apps/emqx_rule_engine/src/emqx_rule_funcs.erl @@ -1185,16 +1185,31 @@ function_literal(Fun, Args) -> {invalid_func, {Fun, Args}}. mongo_date() -> - erlang:timestamp(). + maybe_isodate_format(erlang:timestamp()). mongo_date(MillisecondsTimestamp) -> - convert_timestamp(MillisecondsTimestamp). + maybe_isodate_format(convert_timestamp(MillisecondsTimestamp)). 
mongo_date(Timestamp, Unit) -> InsertedTimeUnit = time_unit(Unit), ScaledEpoch = erlang:convert_time_unit(Timestamp, InsertedTimeUnit, millisecond), convert_timestamp(ScaledEpoch). +maybe_isodate_format(ErlTimestamp) -> + case emqx_rule_sqltester:is_test_runtime_env() of + false -> + ErlTimestamp; + true -> + %% if this is called from sqltest, we need to convert it to the ISODate() format, + %% so that it can be correctly converted into a JSON string. + isodate_format(ErlTimestamp) + end. + +isodate_format({MegaSecs, Secs, MicroSecs}) -> + SystemTimeMs = (MegaSecs * 1000_000_000_000 + Secs * 1000_000 + MicroSecs) div 1000, + Ts3339Str = calendar:system_time_to_rfc3339(SystemTimeMs, [{unit, millisecond}, {offset, "Z"}]), + iolist_to_binary(["ISODate(", Ts3339Str, ")"]). + convert_timestamp(MillisecondsTimestamp) -> MicroTimestamp = MillisecondsTimestamp * 1000, MegaSecs = MicroTimestamp div 1000_000_000_000, diff --git a/apps/emqx_rule_engine/src/emqx_rule_sqltester.erl b/apps/emqx_rule_engine/src/emqx_rule_sqltester.erl index f3b4e2790..867fffcc1 100644 --- a/apps/emqx_rule_engine/src/emqx_rule_sqltester.erl +++ b/apps/emqx_rule_engine/src/emqx_rule_sqltester.erl @@ -18,7 +18,9 @@ -export([ test/1, - get_selected_data/3 + get_selected_data/3, + %% Some SQL functions return different results in the test environment + is_test_runtime_env/0 ]). -spec test(#{sql := binary(), context := map()}) -> {ok, map() | list()} | {error, term()}. @@ -63,12 +65,14 @@ test_rule(Sql, Select, Context, EventTopics) -> created_at => erlang:system_time(millisecond) }, FullContext = fill_default_values(hd(EventTopics), emqx_rule_maps:atom_key_map(Context)), + set_is_test_runtime_env(), try emqx_rule_runtime:apply_rule(Rule, FullContext, #{}) of {ok, Data} -> {ok, flatten(Data)}; {error, Reason} -> {error, Reason} after + unset_is_test_runtime_env(), ok = emqx_rule_engine:clear_metrics_for_rule(RuleId) end. 
@@ -97,3 +101,20 @@ envs_examp(EventTopic) -> emqx_rule_events:columns_with_exam(EventName) ) ). + +is_test_runtime_env_atom() -> + 'emqx_rule_sqltester:is_test_runtime_env'. + +set_is_test_runtime_env() -> + erlang:put(is_test_runtime_env_atom(), true), + ok. + +unset_is_test_runtime_env() -> + erlang:erase(is_test_runtime_env_atom()), + ok. + +is_test_runtime_env() -> + case erlang:get(is_test_runtime_env_atom()) of + true -> true; + _ -> false + end. diff --git a/apps/emqx_rule_engine/test/emqx_rule_engine_api_rule_test_SUITE.erl b/apps/emqx_rule_engine/test/emqx_rule_engine_api_rule_test_SUITE.erl index 575d35238..0c772958e 100644 --- a/apps/emqx_rule_engine/test/emqx_rule_engine_api_rule_test_SUITE.erl +++ b/apps/emqx_rule_engine/test/emqx_rule_engine_api_rule_test_SUITE.erl @@ -214,6 +214,27 @@ t_ctx_delivery_dropped(_) -> Expected = check_result([from_clientid, from_username, reason, qos, topic], [], Context), do_test(SQL, Context, Expected). +t_mongo_date_function_should_return_string_in_test_env(_) -> + SQL = + <<"SELECT mongo_date() as mongo_date FROM \"t/1\"">>, + Context = + #{ + action => <<"publish">>, + clientid => <<"c_emqx">>, + event_type => client_check_authz_complete, + result => <<"allow">>, + topic => <<"t/1">>, + username => <<"u_emqx">> + }, + CheckFunction = fun(Result) -> + MongoDate = maps:get(mongo_date, Result), + %% Use regex to match the expected string + MatchResult = re:run(MongoDate, <<"ISODate\\([0-9]{4}-[0-9]{2}-[0-9]{2}T.*\\)">>), + ?assertMatch({match, _}, MatchResult), + ok + end, + do_test(SQL, Context, CheckFunction). 
+ do_test(SQL, Context, Expected0) -> Res = emqx_rule_engine_api:'/rule_test'( post, From 19d091eef1e833f86fe3a27ecc56bb175b5ab468 Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Mon, 7 Aug 2023 16:28:39 +0200 Subject: [PATCH 04/85] docs: add changelog entry --- changes/ee/fix-11401.en.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/ee/fix-11401.en.md diff --git a/changes/ee/fix-11401.en.md b/changes/ee/fix-11401.en.md new file mode 100644 index 000000000..2bce7170a --- /dev/null +++ b/changes/ee/fix-11401.en.md @@ -0,0 +1 @@ +When running one of the rule engine SQL `mongo_date` functions in the EMQX dashboard test interface, the resulting date is formatted as `ISODate(*)`, where * is the date in ISO date format instead of only the ISO date string. This is the format used by MongoDB to store dates. From b38461e50af081f1557ca7a196cf68a0bfd06879 Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Tue, 8 Aug 2023 14:47:07 +0200 Subject: [PATCH 05/85] fix: mongo_date/2 shall give user friendly value in the test environment --- apps/emqx_rule_engine/src/emqx_rule_funcs.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/emqx_rule_engine/src/emqx_rule_funcs.erl b/apps/emqx_rule_engine/src/emqx_rule_funcs.erl index 0c55f92b4..e498dc642 100644 --- a/apps/emqx_rule_engine/src/emqx_rule_funcs.erl +++ b/apps/emqx_rule_engine/src/emqx_rule_funcs.erl @@ -1193,7 +1193,7 @@ mongo_date(MillisecondsTimestamp) -> mongo_date(Timestamp, Unit) -> InsertedTimeUnit = time_unit(Unit), ScaledEpoch = erlang:convert_time_unit(Timestamp, InsertedTimeUnit, millisecond), - convert_timestamp(ScaledEpoch). + mongo_date(ScaledEpoch). 
maybe_isodate_format(ErlTimestamp) -> case emqx_rule_sqltester:is_test_runtime_env() of From 7b44caeae4d108c4707ec69af0a4515ebcfd843f Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Wed, 16 Aug 2023 15:48:18 +0200 Subject: [PATCH 06/85] fix(emqx_machine_boot): Fix excluded_apps --- apps/emqx_machine/src/emqx_machine_boot.erl | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/apps/emqx_machine/src/emqx_machine_boot.erl b/apps/emqx_machine/src/emqx_machine_boot.erl index 82b909b4f..481927765 100644 --- a/apps/emqx_machine/src/emqx_machine_boot.erl +++ b/apps/emqx_machine/src/emqx_machine_boot.erl @@ -30,12 +30,19 @@ -export([sorted_reboot_apps/1, reboot_apps/0]). -endif. -%% these apps are always (re)started by emqx_machine +%% These apps are always (re)started by emqx_machine: -define(BASIC_REBOOT_APPS, [gproc, esockd, ranch, cowboy, emqx]). -%% If any of these applications crash, the entire EMQX node shuts down +%% If any of these applications crash, the entire EMQX node shuts down: -define(BASIC_PERMANENT_APPS, [mria, ekka, esockd, emqx]). +%% These apps should NOT be (re)started automatically: +-define(EXCLUDED_APPS, [system_monitor, observer_cli, jq]). + +%% These apps are optional, they may or may not be present in the +%% release, depending on the build flags: +-define(OPTIONAL_APPS, [bcrypt, observer]). + post_boot() -> ok = ensure_apps_started(), ok = print_vsn(), @@ -148,9 +155,9 @@ basic_reboot_apps() -> ?BASIC_REBOOT_APPS ++ (BusinessApps -- excluded_apps()). excluded_apps() -> - OptionalApps = [bcrypt, jq, observer], - [system_monitor, observer_cli] ++ - [App || App <- OptionalApps, not is_app(App)]. + %% Optional apps _should_ be (re)started automatically, but only + %% when they are found in the release: + ?EXCLUDED_APPS ++ [App || App <- ?OPTIONAL_APPS, not is_app(App)]. 
is_app(Name) -> case application:load(Name) of From 8adef6b6574c8fb31b0a389dfa539e30d5a1fb07 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Thu, 17 Aug 2023 09:38:06 -0300 Subject: [PATCH 07/85] ci: enumerate integration test suites --- scripts/find-suites.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/find-suites.sh b/scripts/find-suites.sh index 685ab5ec8..47799f885 100755 --- a/scripts/find-suites.sh +++ b/scripts/find-suites.sh @@ -19,8 +19,14 @@ if [ -n "${EMQX_CT_SUITES:-}" ]; then fi TESTDIR="$1/test" +INTEGRATION_TESTDIR="$1/integration_test" # Get the output of the find command IFS=$'\n' read -r -d '' -a FILES < <(find "${TESTDIR}" -name "*_SUITE.erl" 2>/dev/null | sort && printf '\0') +if [[ -d "${INTEGRATION_TESTDIR}" ]]; then + IFS=$'\n' read -r -d '' -a FILES_INTEGRATION < <(find "${INTEGRATION_TESTDIR}" -name "*_SUITE.erl" 2>/dev/null | sort && printf '\0') +fi +# shellcheck disable=SC2206 +FILES+=(${FILES_INTEGRATION:-}) SUITEGROUP_RAW="${SUITEGROUP:-1_1}" SUITEGROUP="$(echo "$SUITEGROUP_RAW" | cut -d '_' -f1)" From 3ffbb7decf36c3f19318ab737d434282811c8d01 Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Tue, 22 Aug 2023 11:32:55 +0200 Subject: [PATCH 08/85] fix: HTTP API /api/v5/publish schema The schema for the /api/v5/publish HTTP API endpoint was incorrect. For 400 (Bad Request) error it cannot return a list but the incorrect schema declared that the response could include a list. 
Fixes: https://emqx.atlassian.net/browse/EMQX-10837 https://github.com/emqx/emqx/issues/11488 --- apps/emqx_management/src/emqx_mgmt_api_publish.erl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/apps/emqx_management/src/emqx_mgmt_api_publish.erl b/apps/emqx_management/src/emqx_mgmt_api_publish.erl index ba486ab89..f0834af96 100644 --- a/apps/emqx_management/src/emqx_mgmt_api_publish.erl +++ b/apps/emqx_management/src/emqx_mgmt_api_publish.erl @@ -57,7 +57,7 @@ schema("/publish") -> responses => #{ ?ALL_IS_WELL => hoconsc:mk(hoconsc:ref(?MODULE, publish_ok)), ?PARTIALLY_OK => hoconsc:mk(hoconsc:ref(?MODULE, publish_error)), - ?BAD_REQUEST => bad_request_schema(), + ?BAD_REQUEST => hoconsc:mk(hoconsc:ref(?MODULE, bad_request)), ?DISPATCH_ERROR => hoconsc:mk(hoconsc:ref(?MODULE, publish_error)) } } @@ -196,11 +196,13 @@ fields(bad_request) -> [ {code, hoconsc:mk(string(), #{ - desc => <<"BAD_REQUEST">> + desc => <<"BAD_REQUEST">>, + example => ?RC_TOPIC_NAME_INVALID })}, {message, hoconsc:mk(binary(), #{ - desc => ?DESC(error_message) + desc => ?DESC(error_message), + example => to_binary(emqx_reason_codes:name(?RC_TOPIC_NAME_INVALID)) })} ]. From 88f7c2b4d888de83c070d850fb76fc937f7d5aab Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Tue, 22 Aug 2023 15:48:27 +0200 Subject: [PATCH 09/85] docs: changelog entry for HTTP publish bad request schema --- changes/ce/fix-11493.en.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/ce/fix-11493.en.md diff --git a/changes/ce/fix-11493.en.md b/changes/ce/fix-11493.en.md new file mode 100644 index 000000000..b05a0d02f --- /dev/null +++ b/changes/ce/fix-11493.en.md @@ -0,0 +1 @@ +Example for and documentation for /api/v5/publish bad request response has been fixed. Previously the documentation example said that the bad request response could return a list in the body which was not actually the case. 
From 19acd82436c4b4bf5fa4441c44a1a0f4923dd109 Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Tue, 22 Aug 2023 16:41:11 +0200 Subject: [PATCH 10/85] docs: fix change log entry according to @thalesmg's suggestion Co-authored-by: Thales Macedo Garitezi --- changes/ce/fix-11493.en.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changes/ce/fix-11493.en.md b/changes/ce/fix-11493.en.md index b05a0d02f..93874933a 100644 --- a/changes/ce/fix-11493.en.md +++ b/changes/ce/fix-11493.en.md @@ -1 +1 @@ -Example for and documentation for /api/v5/publish bad request response has been fixed. Previously the documentation example said that the bad request response could return a list in the body which was not actually the case. +Examples and documentation for /api/v5/publish bad request response have been fixed. Previously the documentation example said that the bad request response could return a list in the body which was not actually the case. From ca697a4e14343ce30cb78b84800451fc8a7c845d Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Thu, 27 Jul 2023 11:36:29 -0300 Subject: [PATCH 11/85] fix: rename `emqx_ds{,_replay}:replay_id()` --- .../src/emqx_ds_message_storage_bitmask.erl | 8 ++++---- .../src/emqx_ds_storage_layer.erl | 12 ++++++------ .../props/emqx_ds_message_storage_bitmask_shim.erl | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/apps/emqx_durable_storage/src/emqx_ds_message_storage_bitmask.erl b/apps/emqx_durable_storage/src/emqx_ds_message_storage_bitmask.erl index 57608e5cb..74a50c302 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_message_storage_bitmask.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_message_storage_bitmask.erl @@ -277,13 +277,13 @@ store(DB = #db{handle = DBHandle, cf = CFHandle}, MessageID, PublishedAt, Topic, Value = make_message_value(Topic, MessagePayload), rocksdb:put(DBHandle, CFHandle, Key, Value, DB#db.write_options). 
--spec make_iterator(db(), emqx_ds:replay()) -> +-spec make_iterator(db(), emqx_ds_replay:replay()) -> {ok, iterator()} | {error, _TODO}. make_iterator(DB, Replay) -> Options = emqx_ds_conf:shard_iteration_options(DB#db.shard), make_iterator(DB, Replay, Options). --spec make_iterator(db(), emqx_ds:replay(), iteration_options()) -> +-spec make_iterator(db(), emqx_ds_replay:replay(), iteration_options()) -> % {error, invalid_start_time}? might just start from the beginning of time % and call it a day: client violated the contract anyway. {ok, iterator()} | {error, _TODO}. @@ -337,7 +337,7 @@ preserve_iterator(#it{cursor = Cursor}) -> }, term_to_binary(State). --spec restore_iterator(db(), emqx_ds:replay(), binary()) -> +-spec restore_iterator(db(), emqx_ds_replay:replay(), binary()) -> {ok, iterator()} | {error, _TODO}. restore_iterator(DB, Replay, Serial) when is_binary(Serial) -> State = binary_to_term(Serial), @@ -419,7 +419,7 @@ hash(Input, Bits) -> % at most 32 bits erlang:phash2(Input, 1 bsl Bits). --spec make_keyspace_filter(emqx_ds:replay(), keymapper()) -> keyspace_filter(). +-spec make_keyspace_filter(emqx_ds_replay:replay(), keymapper()) -> keyspace_filter(). make_keyspace_filter({TopicFilter, StartTime}, Keymapper) -> Bitstring = compute_bitstring(TopicFilter, StartTime, Keymapper), HashBitmask = compute_topic_bitmask(TopicFilter, Keymapper), diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl index 017423b02..7c73dafaf 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl @@ -63,7 +63,7 @@ -record(it, { shard :: emqx_ds:shard(), gen :: gen_id(), - replay :: emqx_ds:replay(), + replay :: emqx_ds_replay:replay(), module :: module(), data :: term() }). @@ -104,10 +104,10 @@ -callback store(_Schema, binary(), emqx_ds:time(), emqx_ds:topic(), binary()) -> ok | {error, _}. 
--callback make_iterator(_Schema, emqx_ds:replay()) -> +-callback make_iterator(_Schema, emqx_ds_replay:replay()) -> {ok, _It} | {error, _}. --callback restore_iterator(_Schema, emqx_ds:replay(), binary()) -> {ok, _It} | {error, _}. +-callback restore_iterator(_Schema, emqx_ds_replay:replay(), binary()) -> {ok, _It} | {error, _}. -callback preserve_iterator(_Schema, _It) -> term(). @@ -132,7 +132,7 @@ store(Shard, GUID, Time, Topic, Msg) -> {_GenId, #{module := Mod, data := Data}} = meta_lookup_gen(Shard, Time), Mod:store(Data, GUID, Time, Topic, Msg). --spec make_iterator(emqx_ds:shard(), emqx_ds:replay()) -> +-spec make_iterator(emqx_ds:shard(), emqx_ds_replay:replay()) -> {ok, iterator()} | {error, _TODO}. make_iterator(Shard, Replay = {_, StartTime}) -> {GenId, Gen} = meta_lookup_gen(Shard, StartTime), @@ -160,12 +160,12 @@ next(It = #it{module = Mod, data = ItData}) -> end end. --spec preserve_iterator(iterator(), emqx_ds:replay_id()) -> +-spec preserve_iterator(iterator(), emqx_ds_replay:replay_id()) -> ok | {error, _TODO}. preserve_iterator(It = #it{}, ReplayID) -> iterator_put_state(ReplayID, It). --spec restore_iterator(emqx_ds:shard(), emqx_ds:replay_id()) -> +-spec restore_iterator(emqx_ds:shard(), emqx_ds_replay:replay_id()) -> {ok, iterator()} | {error, _TODO}. restore_iterator(Shard, ReplayID) -> case iterator_get_state(Shard, ReplayID) of diff --git a/apps/emqx_durable_storage/test/props/emqx_ds_message_storage_bitmask_shim.erl b/apps/emqx_durable_storage/test/props/emqx_ds_message_storage_bitmask_shim.erl index 59668ca01..bbe16f518 100644 --- a/apps/emqx_durable_storage/test/props/emqx_ds_message_storage_bitmask_shim.erl +++ b/apps/emqx_durable_storage/test/props/emqx_ds_message_storage_bitmask_shim.erl @@ -29,7 +29,7 @@ store(Tab, MessageID, PublishedAt, Topic, Payload) -> true = ets:insert(Tab, {{PublishedAt, MessageID}, Topic, Payload}), ok. --spec iterate(t(), emqx_ds:replay()) -> +-spec iterate(t(), emqx_ds_replay:replay()) -> [binary()]. 
iterate(Tab, {TopicFilter, StartTime}) -> ets:foldr( From 9463e271c07e5e1057dd289751e15d1b720f4014 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Tue, 25 Jul 2023 17:09:44 -0300 Subject: [PATCH 12/85] feat(ds): open iterators when handling `SUBSCRIBE` packets Fixes https://emqx.atlassian.net/browse/EMQX-9741 --- apps/emqx/include/emqx.hrl | 3 +- apps/emqx/include/emqx_session.hrl | 57 +++++ apps/emqx/priv/bpapi.versions | 1 + apps/emqx/src/emqx_cm.erl | 12 +- apps/emqx/src/emqx_persistent_session_ds.erl | 49 ++++- apps/emqx/src/emqx_session.erl | 62 ++---- apps/emqx/src/emqx_session_router.erl | 1 + .../emqx_persistent_session.erl | 16 +- .../emqx_persistent_session.hrl | 2 + .../emqx_persistent_session_gc.erl | 1 + apps/emqx/src/proto/emqx_ds_proto_v1.erl | 49 +++++ apps/emqx/test/emqx_cth_cluster.erl | 5 +- .../test/emqx_persistent_messages_SUITE.erl | 194 ++++++++++++++++-- .../test/emqx_persistent_session_SUITE.erl | 22 ++ apps/emqx/test/emqx_proper_types.erl | 45 ++-- apps/emqx_durable_storage/src/emqx_ds.erl | 62 +++++- apps/emqx_durable_storage/src/emqx_ds_int.hrl | 2 +- .../src/emqx_ds_replay.erl | 2 +- .../src/emqx_ds_storage_layer.erl | 58 +++++- .../src/emqx_durable_storage.app.src | 2 +- .../emqx_ds_message_storage_bitmask_shim.erl | 2 + .../src/emqx_mgmt_api_clients.erl | 2 +- 22 files changed, 537 insertions(+), 112 deletions(-) create mode 100644 apps/emqx/include/emqx_session.hrl create mode 100644 apps/emqx/src/proto/emqx_ds_proto_v1.erl diff --git a/apps/emqx/include/emqx.hrl b/apps/emqx/include/emqx.hrl index ac9d297de..e0d1685e8 100644 --- a/apps/emqx/include/emqx.hrl +++ b/apps/emqx/include/emqx.hrl @@ -23,7 +23,6 @@ -define(SHARED_SUB_SHARD, emqx_shared_sub_shard). -define(CM_SHARD, emqx_cm_shard). -define(ROUTE_SHARD, route_shard). --define(PERSISTENT_SESSION_SHARD, emqx_persistent_session_shard). 
%% Banner %%-------------------------------------------------------------------- @@ -92,7 +91,7 @@ -record(route, { topic :: binary(), - dest :: node() | {binary(), node()} | emqx_session:sessionID() + dest :: node() | {binary(), node()} | emqx_session:session_id() }). %%-------------------------------------------------------------------- diff --git a/apps/emqx/include/emqx_session.hrl b/apps/emqx/include/emqx_session.hrl new file mode 100644 index 000000000..fba4cf911 --- /dev/null +++ b/apps/emqx/include/emqx_session.hrl @@ -0,0 +1,57 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2017-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-ifndef(EMQX_SESSION_HRL). +-define(EMQX_SESSION_HRL, true). + +-record(session, { + %% Client's id + clientid :: emqx_types:clientid(), + id :: emqx_session:session_id(), + %% Is this session a persistent session i.e. was it started with Session-Expiry > 0 + is_persistent :: boolean(), + %% Client’s Subscriptions. + subscriptions :: map(), + %% Max subscriptions allowed + max_subscriptions :: non_neg_integer() | infinity, + %% Upgrade QoS? + upgrade_qos :: boolean(), + %% Client <- Broker: QoS1/2 messages sent to the client but + %% have not been unacked. 
+ inflight :: emqx_inflight:inflight(), + %% All QoS1/2 messages published to when client is disconnected, + %% or QoS1/2 messages pending transmission to the Client. + %% + %% Optionally, QoS0 messages pending transmission to the Client. + mqueue :: emqx_mqueue:mqueue(), + %% Next packet id of the session + next_pkt_id = 1 :: emqx_types:packet_id(), + %% Retry interval for redelivering QoS1/2 messages (Unit: millisecond) + retry_interval :: timeout(), + %% Client -> Broker: QoS2 messages received from the client, but + %% have not been completely acknowledged + awaiting_rel :: map(), + %% Maximum number of awaiting QoS2 messages allowed + max_awaiting_rel :: non_neg_integer() | infinity, + %% Awaiting PUBREL Timeout (Unit: millisecond) + await_rel_timeout :: timeout(), + %% Created at + created_at :: pos_integer(), + %% Durable storage iterators for existing subscriptions + iterators = [] :: [emqx_ds_replay:replay_id()] +}). + +-endif. diff --git a/apps/emqx/priv/bpapi.versions b/apps/emqx/priv/bpapi.versions index e13f60654..b6a4c6e7a 100644 --- a/apps/emqx/priv/bpapi.versions +++ b/apps/emqx/priv/bpapi.versions @@ -15,6 +15,7 @@ {emqx_conf,3}. {emqx_dashboard,1}. {emqx_delayed,1}. +{emqx_ds,1}. {emqx_eviction_agent,1}. {emqx_exhook,1}. {emqx_ft_storage_exporter_fs,1}. diff --git a/apps/emqx/src/emqx_cm.erl b/apps/emqx/src/emqx_cm.erl index c680560fb..b98222959 100644 --- a/apps/emqx/src/emqx_cm.erl +++ b/apps/emqx/src/emqx_cm.erl @@ -21,6 +21,7 @@ -include("emqx.hrl"). -include("emqx_cm.hrl"). +-include("emqx_session.hrl"). -include("logger.hrl"). -include("types.hrl"). -include_lib("snabbkaffe/include/snabbkaffe.hrl"). 
@@ -301,7 +302,16 @@ open_session(false, ClientInfo = #{clientid := ClientId}, ConnInfo) -> create_session(ClientInfo, ConnInfo) -> Options = get_session_confs(ClientInfo, ConnInfo), - Session = emqx_session:init(Options), + #{clientid := ClientID} = ClientInfo, + Session0 = emqx_session:init(Options), + IteratorIDs = + case emqx_persistent_session_ds:open_session(ClientID) of + {skipped, disabled} -> + []; + {_IsNew, _DSSessionID, Iterators0} -> + Iterators0 + end, + Session = Session0#session{iterators = IteratorIDs}, ok = emqx_metrics:inc('session.created'), ok = emqx_hooks:run('session.created', [ClientInfo, emqx_session:info(Session)]), Session. diff --git a/apps/emqx/src/emqx_persistent_session_ds.erl b/apps/emqx/src/emqx_persistent_session_ds.erl index 27b4f0950..7e9db4707 100644 --- a/apps/emqx/src/emqx_persistent_session_ds.erl +++ b/apps/emqx/src/emqx_persistent_session_ds.erl @@ -16,15 +16,24 @@ -module(emqx_persistent_session_ds). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). + -export([init/0]). --export([persist_message/1]). +-export([ + persist_message/1, + open_session/1, + add_subscription/2 +]). -export([ serialize_message/1, deserialize_message/1 ]). +%% RPC +-export([do_open_iterator/3]). + %% FIXME -define(DS_SHARD, <<"local">>). @@ -72,6 +81,44 @@ store_message(Msg) -> find_subscribers(_Msg) -> [node()]. +open_session(ClientID) -> + ?WHEN_ENABLED(emqx_ds:session_open(ClientID)). + +-spec add_subscription(emqx_types:topic(), emqx_ds:session_id()) -> + {ok, emqx_ds:iterator_id(), _IsNew :: boolean()} | {skipped, disabled}. +add_subscription(TopicFilterBin, DSSessionID) -> + ?WHEN_ENABLED( + begin + TopicFilter = emqx_topic:words(TopicFilterBin), + {ok, IteratorID, StartMS, IsNew} = emqx_ds:session_add_iterator( + DSSessionID, TopicFilter + ), + case IsNew of + true -> + ok = open_iterator_on_all_nodes(TopicFilter, StartMS, IteratorID); + false -> + ok + end, + {ok, IteratorID, IsNew} + end + ). 
+ +-spec open_iterator_on_all_nodes(emqx_topic:words(), emqx_ds:time(), emqx_ds:iterator_id()) -> ok. +open_iterator_on_all_nodes(TopicFilter, StartMS, IteratorID) -> + Nodes = emqx:running_nodes(), + Results = emqx_ds_proto_v1:open_iterator(Nodes, TopicFilter, StartMS, IteratorID), + %% TODO: handle errors + true = lists:all(fun(Res) -> Res =:= {ok, ok} end, Results), + ok. + +-spec do_open_iterator(emqx_topic:words(), emqx_ds:time(), emqx_ds:iterator_id()) -> ok. +do_open_iterator(TopicFilter, StartMS, IteratorID) -> + Replay = {TopicFilter, StartMS}, + %% FIXME: choose DS shard based on ...? + {ok, It} = emqx_ds_storage_layer:make_iterator(?DS_SHARD, Replay), + ok = emqx_ds_storage_layer:preserve_iterator(It, IteratorID), + ok. + %% serialize_message(Msg) -> diff --git a/apps/emqx/src/emqx_session.erl b/apps/emqx/src/emqx_session.erl index d838e95d0..8f1933fc1 100644 --- a/apps/emqx/src/emqx_session.erl +++ b/apps/emqx/src/emqx_session.erl @@ -44,6 +44,7 @@ -module(emqx_session). -include("emqx.hrl"). +-include("emqx_session.hrl"). -include("emqx_mqtt.hrl"). -include("logger.hrl"). -include("types.hrl"). @@ -101,49 +102,13 @@ %% Export for CT -export([set_field/3]). --type sessionID() :: emqx_guid:guid(). +-type session_id() :: emqx_guid:guid(). -export_type([ session/0, - sessionID/0 + session_id/0 ]). --record(session, { - %% Client's id - clientid :: emqx_types:clientid(), - id :: sessionID(), - %% Is this session a persistent session i.e. was it started with Session-Expiry > 0 - is_persistent :: boolean(), - %% Client’s Subscriptions. - subscriptions :: map(), - %% Max subscriptions allowed - max_subscriptions :: non_neg_integer() | infinity, - %% Upgrade QoS? - upgrade_qos :: boolean(), - %% Client <- Broker: QoS1/2 messages sent to the client but - %% have not been unacked. - inflight :: emqx_inflight:inflight(), - %% All QoS1/2 messages published to when client is disconnected, - %% or QoS1/2 messages pending transmission to the Client. 
- %% - %% Optionally, QoS0 messages pending transmission to the Client. - mqueue :: emqx_mqueue:mqueue(), - %% Next packet id of the session - next_pkt_id = 1 :: emqx_types:packet_id(), - %% Retry interval for redelivering QoS1/2 messages (Unit: millisecond) - retry_interval :: timeout(), - %% Client -> Broker: QoS2 messages received from the client, but - %% have not been completely acknowledged - awaiting_rel :: map(), - %% Maximum number of awaiting QoS2 messages allowed - max_awaiting_rel :: non_neg_integer() | infinity, - %% Awaiting PUBREL Timeout (Unit: millisecond) - await_rel_timeout :: timeout(), - %% Created at - created_at :: pos_integer() - %% Message deliver latency stats -}). - -type inflight_data_phase() :: wait_ack | wait_comp. -record(inflight_data, { @@ -297,7 +262,9 @@ info(awaiting_rel_max, #session{max_awaiting_rel = Max}) -> info(await_rel_timeout, #session{await_rel_timeout = Timeout}) -> Timeout; info(created_at, #session{created_at = CreatedAt}) -> - CreatedAt. + CreatedAt; +info(iterators, #session{iterators = IteratorIds}) -> + IteratorIds. %% @doc Get stats of the session. -spec stats(session()) -> emqx_types:stats(). @@ -324,11 +291,13 @@ subscribe( case IsNew andalso is_subscriptions_full(Session) of false -> ok = emqx_broker:subscribe(TopicFilter, ClientId, SubOpts), + Session1 = Session#session{subscriptions = maps:put(TopicFilter, SubOpts, Subs)}, + Session2 = add_persistent_subscription(TopicFilter, ClientId, Session1), ok = emqx_hooks:run( 'session.subscribed', [ClientInfo, TopicFilter, SubOpts#{is_new => IsNew}] ), - {ok, Session#session{subscriptions = maps:put(TopicFilter, SubOpts, Subs)}}; + {ok, Session2}; true -> {error, ?RC_QUOTA_EXCEEDED} end. @@ -341,6 +310,19 @@ is_subscriptions_full(#session{ }) -> maps:size(Subs) >= MaxLimit. +-spec add_persistent_subscription(emqx_types:topic(), emqx_types:clientid(), session()) -> + session(). 
+add_persistent_subscription(TopicFilterBin, ClientId, Session) -> + case emqx_persistent_session_ds:add_subscription(TopicFilterBin, ClientId) of + {skipped, disabled} -> + Session; + {ok, IteratorID, _IsNew = true} -> + Iterators = Session#session.iterators, + Session#session{iterators = [IteratorID | Iterators]}; + {ok, _IteratorID, _IsNew = false} -> + Session + end. + %%-------------------------------------------------------------------- %% Client -> Broker: UNSUBSCRIBE %%-------------------------------------------------------------------- diff --git a/apps/emqx/src/emqx_session_router.erl b/apps/emqx/src/emqx_session_router.erl index 1567f9e62..25484bdf0 100644 --- a/apps/emqx/src/emqx_session_router.erl +++ b/apps/emqx/src/emqx_session_router.erl @@ -21,6 +21,7 @@ -include("emqx.hrl"). -include("logger.hrl"). -include("types.hrl"). +-include("persistent_session/emqx_persistent_session.hrl"). -include_lib("snabbkaffe/include/snabbkaffe.hrl"). diff --git a/apps/emqx/src/persistent_session/emqx_persistent_session.erl b/apps/emqx/src/persistent_session/emqx_persistent_session.erl index 111154571..d85e13d67 100644 --- a/apps/emqx/src/persistent_session/emqx_persistent_session.erl +++ b/apps/emqx/src/persistent_session/emqx_persistent_session.erl @@ -115,10 +115,10 @@ storage_backend() -> %% Session message ADT API %%-------------------------------------------------------------------- --spec session_message_info('timestamp' | 'sessionID', sess_msg_key()) -> term(). +-spec session_message_info('timestamp' | 'session_id', sess_msg_key()) -> term(). session_message_info(timestamp, {_, <<>>, <>, ?ABANDONED}) -> TS; session_message_info(timestamp, {_, GUID, _, _}) -> emqx_guid:timestamp(GUID); -session_message_info(sessionID, {SessionID, _, _, _}) -> SessionID. +session_message_info(session_id, {SessionID, _, _, _}) -> SessionID. 
%%-------------------------------------------------------------------- %% DB API @@ -243,7 +243,7 @@ discard_opt(true, ClientID, Session) -> emqx_session_router:delete_routes(SessionID, Subscriptions), emqx_session:set_field(is_persistent, false, Session). --spec mark_resume_begin(emqx_session:sessionID()) -> emqx_guid:guid(). +-spec mark_resume_begin(emqx_session:session_id()) -> emqx_guid:guid(). mark_resume_begin(SessionID) -> MarkerID = emqx_guid:gen(), put_session_message({SessionID, MarkerID, <<>>, ?MARKER}), @@ -396,12 +396,12 @@ do_mark_as_delivered(SessionID, [{deliver, STopic, Msg} | Left]) -> do_mark_as_delivered(_SessionID, []) -> ok. --spec pending(emqx_session:sessionID()) -> +-spec pending(emqx_session:session_id()) -> [{emqx_types:message(), STopic :: binary()}]. pending(SessionID) -> pending_messages_in_db(SessionID, []). --spec pending(emqx_session:sessionID(), MarkerIDs :: [emqx_guid:guid()]) -> +-spec pending(emqx_session:session_id(), MarkerIDs :: [emqx_guid:guid()]) -> [{emqx_types:message(), STopic :: binary()}]. pending(SessionID, MarkerIds) -> %% TODO: Handle lost MarkerIDs @@ -460,8 +460,8 @@ read_pending_msgs([], Acc) -> lists:reverse(Acc). %% The keys are ordered by -%% {sessionID(), <<>>, bin_timestamp(), ?ABANDONED} For abandoned sessions (clean started or expired). -%% {sessionID(), emqx_guid:guid(), STopic :: binary(), ?DELIVERED | ?UNDELIVERED | ?MARKER} +%% {session_id(), <<>>, bin_timestamp(), ?ABANDONED} For abandoned sessions (clean started or expired). 
+%% {session_id(), emqx_guid:guid(), STopic :: binary(), ?DELIVERED | ?UNDELIVERED | ?MARKER} %% where %% <<>> < emqx_guid:guid() %% <<>> < bin_timestamp() @@ -491,7 +491,7 @@ pending_messages({SessionID, PrevMsgId, PrevSTopic, PrevTag} = PrevKey, Acc, Mar false -> pending_messages(Key, Acc, MarkerIds); true -> pending_messages(Key, [{PrevMsgId, PrevSTopic} | Acc], MarkerIds) end; - %% Next sessionID or '$end_of_table' + %% Next session_id or '$end_of_table' _What -> case PrevTag =:= ?UNDELIVERED of false -> {lists:reverse(Acc), MarkerIds}; diff --git a/apps/emqx/src/persistent_session/emqx_persistent_session.hrl b/apps/emqx/src/persistent_session/emqx_persistent_session.hrl index eb4224116..5476d8daf 100644 --- a/apps/emqx/src/persistent_session/emqx_persistent_session.hrl +++ b/apps/emqx/src/persistent_session/emqx_persistent_session.hrl @@ -14,6 +14,8 @@ %% limitations under the License. %%-------------------------------------------------------------------- +-define(PERSISTENT_SESSION_SHARD, emqx_persistent_session_shard). + -record(session_store, { client_id :: binary(), expiry_interval :: non_neg_integer(), diff --git a/apps/emqx/src/persistent_session/emqx_persistent_session_gc.erl b/apps/emqx/src/persistent_session/emqx_persistent_session_gc.erl index a4c4e5422..4aa59cdef 100644 --- a/apps/emqx/src/persistent_session/emqx_persistent_session_gc.erl +++ b/apps/emqx/src/persistent_session/emqx_persistent_session_gc.erl @@ -56,6 +56,7 @@ start_link() -> init([]) -> process_flag(trap_exit, true), + mria_rlog:ensure_shard(?PERSISTENT_SESSION_SHARD), {ok, start_message_gc_timer(start_session_gc_timer(#{}))}. 
handle_call(_Request, _From, State) -> diff --git a/apps/emqx/src/proto/emqx_ds_proto_v1.erl b/apps/emqx/src/proto/emqx_ds_proto_v1.erl new file mode 100644 index 000000000..2283b7e4e --- /dev/null +++ b/apps/emqx/src/proto/emqx_ds_proto_v1.erl @@ -0,0 +1,49 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_ds_proto_v1). + +-behaviour(emqx_bpapi). + +-export([ + introduced_in/0, + + open_iterator/4 +]). + +-include_lib("emqx/include/bpapi.hrl"). + +-define(TIMEOUT, 30_000). + +introduced_in() -> + %% FIXME + "5.3.0". + +-spec open_iterator( + [node()], + emqx_topic:words(), + emqx_ds:time(), + emqx_ds:iterator_id() +) -> + emqx_rpc:erpc_multicall(ok). +open_iterator(Nodes, TopicFilter, StartMS, IteratorID) -> + erpc:multicall( + Nodes, + emqx_persistent_session_ds, + do_open_iterator, + [TopicFilter, StartMS, IteratorID], + ?TIMEOUT + ). diff --git a/apps/emqx/test/emqx_cth_cluster.erl b/apps/emqx/test/emqx_cth_cluster.erl index 5e8bd4103..ddcb3234c 100644 --- a/apps/emqx/test/emqx_cth_cluster.erl +++ b/apps/emqx/test/emqx_cth_cluster.erl @@ -20,6 +20,7 @@ -export([stop/1]). -export([share_load_module/2]). +-export([node_name/1]). -define(APPS_CLUSTERING, [gen_rpc, mria, ekka]). @@ -83,7 +84,7 @@ when }. 
start(Nodes, ClusterOpts) -> NodeSpecs = mk_nodespecs(Nodes, ClusterOpts), - ct:pal("Starting cluster: ~p", [NodeSpecs]), + ct:pal("Starting cluster:\n ~p", [NodeSpecs]), % 1. Start bare nodes with only basic applications running _ = emqx_utils:pmap(fun start_node_init/1, NodeSpecs, ?TIMEOUT_NODE_START_MS), % 2. Start applications needed to enable clustering @@ -237,6 +238,8 @@ default_appspec(emqx_conf, Spec, _NodeSpecs) -> listeners => allocate_listener_ports([tcp, ssl, ws, wss], Spec) } }; +default_appspec(emqx, Spec = #{listeners := true}, _NodeSpecs) -> + #{config => #{listeners => allocate_listener_ports([tcp, ssl, ws, wss], Spec)}}; default_appspec(_App, _, _) -> #{}. diff --git a/apps/emqx/test/emqx_persistent_messages_SUITE.erl b/apps/emqx/test/emqx_persistent_messages_SUITE.erl index b818e3fec..62702b3bc 100644 --- a/apps/emqx/test/emqx_persistent_messages_SUITE.erl +++ b/apps/emqx/test/emqx_persistent_messages_SUITE.erl @@ -17,6 +17,8 @@ -module(emqx_persistent_messages_SUITE). -include_lib("stdlib/include/assert.hrl"). +-include_lib("common_test/include/ct.hrl"). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). -compile(export_all). -compile(nowarn_export_all). @@ -24,25 +26,38 @@ -define(NOW, (calendar:system_time_to_rfc3339(erlang:system_time(millisecond), [{unit, millisecond}])) ). +-define(DS_SHARD, <<"local">>). all() -> emqx_common_test_helpers:all(?MODULE). init_per_suite(Config) -> - {ok, _} = application:ensure_all_started(emqx_durable_storage), - ok = emqx_common_test_helpers:start_apps([], fun - (emqx) -> - emqx_common_test_helpers:boot_modules(all), - emqx_config:init_load(emqx_schema, <<"persistent_session_store.ds = true">>), - emqx_app:set_config_loader(?MODULE); - (_) -> - ok - end), + %% avoid inter-suite flakiness... + application:stop(emqx), + application:stop(emqx_durable_storage), + TCApps = emqx_cth_suite:start( + app_specs(), + #{work_dir => ?config(priv_dir, Config)} + ), + [{tc_apps, TCApps} | Config]. 
+ +end_per_suite(Config) -> + TCApps = ?config(tc_apps, Config), + emqx_cth_suite:stop(TCApps), + ok. + +init_per_testcase(t_session_subscription_iterators, Config) -> + Cluster = cluster(), + Nodes = emqx_cth_cluster:start(Cluster, #{work_dir => ?config(priv_dir, Config)}), + [{nodes, Nodes} | Config]; +init_per_testcase(_TestCase, Config) -> Config. -end_per_suite(_Config) -> - emqx_common_test_helpers:stop_apps([]), - application:stop(emqx_durable_storage), +end_per_testcase(t_session_subscription_iterators, Config) -> + Nodes = ?config(nodes, Config), + ok = emqx_cth_cluster:stop(Nodes), + ok; +end_per_testcase(_TestCase, _Config) -> ok. t_messages_persisted(_Config) -> @@ -76,7 +91,7 @@ t_messages_persisted(_Config) -> ct:pal("Results = ~p", [Results]), - Persisted = consume(<<"local">>, {['#'], 0}), + Persisted = consume(?DS_SHARD, {['#'], 0}), ct:pal("Persisted = ~p", [Persisted]), @@ -88,6 +103,98 @@ t_messages_persisted(_Config) -> ok. +%% TODO: test quic and ws too +t_session_subscription_iterators(Config) -> + [Node1, Node2] = ?config(nodes, Config), + Port = get_mqtt_port(Node1, tcp), + Topic = <<"t/topic">>, + SubTopicFilter = <<"t/+">>, + AnotherTopic = <<"u/another-topic">>, + ClientId = <<"myclientid">>, + ?check_trace( + begin + [ + Payload1, + Payload2, + Payload3, + Payload4 + ] = lists:map( + fun(N) -> <<"hello", (integer_to_binary(N))/binary>> end, + lists:seq(1, 4) + ), + ct:pal("starting"), + {ok, Client} = emqtt:start_link([ + {port, Port}, + {clientid, ClientId}, + {proto_ver, v5} + ]), + {ok, _} = emqtt:connect(Client), + ct:pal("publishing 1"), + Message1 = emqx_message:make(Topic, Payload1), + publish(Node1, Message1), + receive_messages(1), + ct:pal("subscribing 1"), + {ok, _, [2]} = emqtt:subscribe(Client, SubTopicFilter, qos2), + ct:pal("publishing 2"), + Message2 = emqx_message:make(Topic, Payload2), + publish(Node1, Message2), + receive_messages(1), + ct:pal("subscribing 2"), + {ok, _, [1]} = emqtt:subscribe(Client, SubTopicFilter, 
qos1), + ct:pal("publishing 3"), + Message3 = emqx_message:make(Topic, Payload3), + publish(Node1, Message3), + receive_messages(1), + ct:pal("publishing 4"), + Message4 = emqx_message:make(AnotherTopic, Payload4), + publish(Node1, Message4), + IteratorIds = get_iterator_ids(Node1, ClientId), + emqtt:stop(Client), + #{ + messages => [Message1, Message2, Message3, Message4], + iterator_ids => IteratorIds + } + end, + fun(Results, Trace) -> + ct:pal("trace:\n ~p", [Trace]), + #{ + messages := [_Message1, Message2, Message3 | _], + iterator_ids := IteratorIds + } = Results, + case ?of_kind(ds_session_subscription_added, Trace) of + [] -> + %% Since `emqx_durable_storage' is a dependency of `emqx', it gets + %% compiled in "prod" mode when running emqx standalone tests. + ok; + [_ | _] -> + ?assertMatch( + [ + #{?snk_kind := ds_session_subscription_added}, + #{?snk_kind := ds_session_subscription_present} + ], + ?of_kind( + [ + ds_session_subscription_added, + ds_session_subscription_present + ], + Trace + ) + ), + ok + end, + ?assertMatch([_], IteratorIds), + [IteratorId] = IteratorIds, + ReplayMessages1 = erpc:call(Node1, fun() -> consume(?DS_SHARD, IteratorId) end), + ExpectedMessages = [Message2, Message3], + ?assertEqual(ExpectedMessages, ReplayMessages1), + %% Different DS shard + ReplayMessages2 = erpc:call(Node2, fun() -> consume(?DS_SHARD, IteratorId) end), + ?assertEqual([], ReplayMessages2), + ok + end + ), + ok. + %% connect(ClientId, CleanStart, EI) -> @@ -103,8 +210,11 @@ connect(ClientId, CleanStart, EI) -> {ok, _} = emqtt:connect(Client), Client. -consume(Shard, Replay) -> +consume(Shard, Replay = {_TopicFiler, _StartMS}) -> {ok, It} = emqx_ds_storage_layer:make_iterator(Shard, Replay), + consume(It); +consume(Shard, IteratorId) when is_binary(IteratorId) -> + {ok, It} = emqx_ds_storage_layer:restore_iterator(Shard, IteratorId), consume(It). consume(It) -> @@ -114,3 +224,59 @@ consume(It) -> none -> [] end. 
+ +receive_messages(Count) -> + receive_messages(Count, []). + +receive_messages(0, Msgs) -> + Msgs; +receive_messages(Count, Msgs) -> + receive + {publish, Msg} -> + receive_messages(Count - 1, [Msg | Msgs]); + {deliver, _Topic, Msg} -> + receive_messages(Count - 1, [Msg | Msgs]); + _Other -> + receive_messages(Count, Msgs) + after 5000 -> + Msgs + end. + +publish(Node, Message) -> + erpc:call(Node, emqx, publish, [Message]). + +get_iterator_ids(Node, ClientId) -> + Channel = erpc:call(Node, fun() -> + [ConnPid] = emqx_cm:lookup_channels(ClientId), + sys:get_state(ConnPid) + end), + emqx_connection:info({channel, {session, iterators}}, Channel). + +app_specs() -> + [ + emqx_durable_storage, + {emqx, #{ + before_start => fun() -> + emqx_app:set_config_loader(?MODULE) + end, + config => #{persistent_session_store => #{ds => true}}, + override_env => [{boot_modules, [broker, listeners]}] + }} + ]. + +cluster() -> + Node1 = persistent_messages_SUITE1, + Spec = #{ + role => core, + join_to => emqx_cth_cluster:node_name(Node1), + listeners => true, + apps => app_specs() + }, + [ + {Node1, Spec}, + {persistent_messages_SUITE2, Spec} + ]. + +get_mqtt_port(Node, Type) -> + {_IP, Port} = erpc:call(Node, emqx_config, get, [[listeners, Type, default, bind]]), + Port. diff --git a/apps/emqx/test/emqx_persistent_session_SUITE.erl b/apps/emqx/test/emqx_persistent_session_SUITE.erl index 07cfabc70..d8736b918 100644 --- a/apps/emqx/test/emqx_persistent_session_SUITE.erl +++ b/apps/emqx/test/emqx_persistent_session_SUITE.erl @@ -267,6 +267,8 @@ receive_messages(Count, Msgs) -> receive {publish, Msg} -> receive_messages(Count - 1, [Msg | Msgs]); + {deliver, _Topic, Msg} -> + receive_messages(Count - 1, [Msg | Msgs]); _Other -> receive_messages(Count, Msgs) after 5000 -> @@ -373,6 +375,26 @@ do_publish(Payloads = [_ | _], PublishFun, WaitForUnregister) -> do_publish(Payload, PublishFun, WaitForUnregister) -> do_publish([Payload], PublishFun, WaitForUnregister). 
+get_replay_messages(ReplayID) -> + DSShard = <<"local">>, + case emqx_ds_storage_layer:restore_iterator(DSShard, ReplayID) of + {ok, It} -> + do_get_replay_messages(It, []); + Error -> + error({"error restoring iterator", #{error => Error, replay_id => ReplayID}}) + end. + +do_get_replay_messages(It, Acc) -> + case emqx_ds_storage_layer:next(It) of + {value, Val, NewIt} -> + Msg = erlang:binary_to_term(Val), + do_get_replay_messages(NewIt, [Msg | Acc]); + none -> + {ok, lists:reverse(Acc)}; + {error, Reason} -> + {error, Reason} + end. + %%-------------------------------------------------------------------- %% Test Cases %%-------------------------------------------------------------------- diff --git a/apps/emqx/test/emqx_proper_types.erl b/apps/emqx/test/emqx_proper_types.erl index 0e9d3032c..56e0b23b8 100644 --- a/apps/emqx/test/emqx_proper_types.erl +++ b/apps/emqx/test/emqx_proper_types.erl @@ -20,6 +20,7 @@ -include_lib("proper/include/proper.hrl"). -include("emqx.hrl"). +-include("emqx_session.hrl"). -include("emqx_access_control.hrl"). 
%% High level Types @@ -132,33 +133,23 @@ clientinfo() -> sessioninfo() -> ?LET( Session, - {session, clientid(), - % id - sessionid(), - % is_persistent - boolean(), - % subscriptions - subscriptions(), - % max_subscriptions - non_neg_integer(), - % upgrade_qos - boolean(), - % emqx_inflight:inflight() - inflight(), - % emqx_mqueue:mqueue() - mqueue(), - % next_pkt_id - packet_id(), - % retry_interval - safty_timeout(), - % awaiting_rel - awaiting_rel(), - % max_awaiting_rel - non_neg_integer(), - % await_rel_timeout - safty_timeout(), - % created_at - timestamp()}, + #session{ + clientid = clientid(), + id = sessionid(), + is_persistent = boolean(), + subscriptions = subscriptions(), + max_subscriptions = non_neg_integer(), + upgrade_qos = boolean(), + inflight = inflight(), + mqueue = mqueue(), + next_pkt_id = packet_id(), + retry_interval = safty_timeout(), + awaiting_rel = awaiting_rel(), + max_awaiting_rel = non_neg_integer(), + await_rel_timeout = safty_timeout(), + created_at = timestamp(), + iterators = [] + }, emqx_session:info(Session) ). diff --git a/apps/emqx_durable_storage/src/emqx_ds.erl b/apps/emqx_durable_storage/src/emqx_ds.erl index 9eccf8c16..a69357975 100644 --- a/apps/emqx_durable_storage/src/emqx_ds.erl +++ b/apps/emqx_durable_storage/src/emqx_ds.erl @@ -15,6 +15,8 @@ %%-------------------------------------------------------------------- -module(emqx_ds). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). + %% API: -export([ensure_shard/2]). %% Messages: @@ -56,7 +58,7 @@ -type iterator() :: term(). --opaque iterator_id() :: binary(). +-type iterator_id() :: binary(). %%-type session() :: #session{}. @@ -73,7 +75,8 @@ %% Timestamp %% Earliest possible timestamp is 0. -%% TODO granularity? +%% TODO granularity? Currently, we should always use micro second, as that's the unit we +%% use in emqx_guid. Otherwise, the iterators won't match the message timestamps. -type time() :: non_neg_integer(). 
%%================================================================================ @@ -129,11 +132,13 @@ session_open(ClientID) -> fun() -> case mnesia:read(?SESSION_TAB, ClientID) of [#session{iterators = Iterators}] -> - {false, ClientID, Iterators}; + IteratorIDs = maps:values(Iterators), + {false, ClientID, IteratorIDs}; [] -> - Session = #session{id = ClientID, iterators = []}, + Iterators = #{}, + Session = #session{id = ClientID, iterators = Iterators}, mnesia:write(?SESSION_TAB, Session, write), - {true, ClientID, []} + {true, ClientID, _IteratorIDs = []} end end ), @@ -160,10 +165,38 @@ session_suspend(_SessionId) -> %% @doc Called when a client subscribes to a topic. Idempotent. -spec session_add_iterator(session_id(), emqx_topic:words()) -> - {ok, iterator_id()} | {error, session_not_found}. -session_add_iterator(_SessionId, _TopicFilter) -> - %% TODO - {ok, <<"">>}. + {ok, iterator_id(), time(), _IsNew :: boolean()} | {error, session_not_found}. +session_add_iterator(DSSessionId, TopicFilter) -> + {atomic, Ret} = + mria:transaction( + ?DS_SHARD, + fun() -> + case mnesia:wread({?SESSION_TAB, DSSessionId}) of + [] -> + {error, session_not_found}; + [#session{iterators = #{TopicFilter := IteratorId}}] -> + StartMS = get_start_ms(IteratorId, DSSessionId), + ?tp( + ds_session_subscription_present, + #{iterator_id => IteratorId, session_id => DSSessionId} + ), + IsNew = false, + {ok, IteratorId, StartMS, IsNew}; + [#session{iterators = Iterators0} = Session0] -> + {IteratorId, StartMS} = new_iterator_id(DSSessionId), + Iterators = Iterators0#{TopicFilter => IteratorId}, + Session = Session0#session{iterators = Iterators}, + mnesia:write(?SESSION_TAB, Session, write), + ?tp( + ds_session_subscription_added, + #{iterator_id => IteratorId, session_id => DSSessionId} + ), + IsNew = true, + {ok, IteratorId, StartMS, IsNew} + end + end + ), + Ret. %% @doc Called when a client unsubscribes from a topic. 
Returns `true' %% if the session contained the subscription or `false' if it wasn't @@ -201,3 +234,14 @@ iterator_stats() -> %%================================================================================ %% Internal functions %%================================================================================ + +-spec new_iterator_id(session_id()) -> {iterator_id(), time()}. +new_iterator_id(DSSessionId) -> + NowMS = erlang:system_time(microsecond), + NowMSBin = integer_to_binary(NowMS), + {<>, NowMS}. + +-spec get_start_ms(iterator_id(), emqx_session:session_id()) -> time(). +get_start_ms(IteratorId, SessionId) -> + <> = IteratorId, + binary_to_integer(StartMSBin). diff --git a/apps/emqx_durable_storage/src/emqx_ds_int.hrl b/apps/emqx_durable_storage/src/emqx_ds_int.hrl index 96688ede6..fa11a6600 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_int.hrl +++ b/apps/emqx_durable_storage/src/emqx_ds_int.hrl @@ -21,7 +21,7 @@ -record(session, { id :: emqx_ds:session_id(), - iterators :: [{emqx_topic:words(), emqx_ds:iterator_id()}] + iterators :: #{emqx_topic:words() => emqx_ds:iterator_id()} }). -endif. diff --git a/apps/emqx_durable_storage/src/emqx_ds_replay.erl b/apps/emqx_durable_storage/src/emqx_ds_replay.erl index a66cee7fd..b9ffa32ac 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_replay.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_replay.erl @@ -15,7 +15,7 @@ -type replay_id() :: binary(). -type replay() :: { - _TopicFilter :: emqx_ds:topic(), + _TopicFilter :: emqx_ds:words(), _StartTime :: emqx_ds:time() }. diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl index 7c73dafaf..ac4649d94 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl @@ -13,7 +13,13 @@ -export([make_iterator/2, next/1]). --export([preserve_iterator/2, restore_iterator/2, discard_iterator/2]). 
+-export([ + preserve_iterator/2, + restore_iterator/2, + discard_iterator/2, + is_iterator_present/2, + discard_iterator_prefix/2 +]). %% behaviour callbacks: -export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2]). @@ -160,10 +166,10 @@ next(It = #it{module = Mod, data = ItData}) -> end end. --spec preserve_iterator(iterator(), emqx_ds_replay:replay_id()) -> +-spec preserve_iterator(iterator(), emqx_ds:iterator_id()) -> ok | {error, _TODO}. -preserve_iterator(It = #it{}, ReplayID) -> - iterator_put_state(ReplayID, It). +preserve_iterator(It = #it{}, IteratorID) -> + iterator_put_state(IteratorID, It). -spec restore_iterator(emqx_ds:shard(), emqx_ds_replay:replay_id()) -> {ok, iterator()} | {error, _TODO}. @@ -177,11 +183,27 @@ restore_iterator(Shard, ReplayID) -> Error end. --spec discard_iterator(emqx_ds:shard(), emqx_ds:replay_id()) -> +-spec is_iterator_present(emqx_ds:shard(), emqx_ds_replay:replay_id()) -> + boolean(). +is_iterator_present(Shard, ReplayID) -> + %% TODO: use keyMayExist after added to wrapper? + case iterator_get_state(Shard, ReplayID) of + {ok, _} -> + true; + _ -> + false + end. + +-spec discard_iterator(emqx_ds:shard(), emqx_ds_replay:replay_id()) -> ok | {error, _TODO}. discard_iterator(Shard, ReplayID) -> iterator_delete(Shard, ReplayID). +-spec discard_iterator_prefix(emqx_ds:shard(), binary()) -> + ok | {error, _TODO}. +discard_iterator_prefix(Shard, KeyPrefix) -> + do_discard_iterator_prefix(Shard, KeyPrefix). + %%================================================================================ %% behaviour callbacks %%================================================================================ @@ -391,6 +413,32 @@ restore_iterator_state( It = #it{shard = Shard, gen = Gen, replay = {TopicFilter, StartTime}}, open_restore_iterator(meta_get_gen(Shard, Gen), It, State). 
+do_discard_iterator_prefix(Shard, KeyPrefix) -> + #db{handle = Handle, cf_iterator = CF} = meta_lookup(Shard, db), + case rocksdb:iterator(Handle, CF, ?ITERATION_READ_OPTS) of + {ok, It} -> + NextAction = {seek, KeyPrefix}, + do_discard_iterator_prefix(Handle, CF, It, KeyPrefix, NextAction); + Error -> + Error + end. + +do_discard_iterator_prefix(DBHandle, CF, It, KeyPrefix, NextAction) -> + case rocksdb:iterator_move(It, NextAction) of + {ok, K = <>, _V} -> + ok = rocksdb:delete(DBHandle, CF, K, ?ITERATION_WRITE_OPTS), + do_discard_iterator_prefix(DBHandle, CF, It, KeyPrefix, next); + {ok, _K, _V} -> + ok = rocksdb:iterator_close(It), + ok; + {error, invalid_iterator} -> + ok = rocksdb:iterator_close(It), + ok; + Error -> + ok = rocksdb:iterator_close(It), + Error + end. + %% Functions for dealing with the metadata stored persistently in rocksdb -define(CURRENT_GEN, <<"current">>). diff --git a/apps/emqx_durable_storage/src/emqx_durable_storage.app.src b/apps/emqx_durable_storage/src/emqx_durable_storage.app.src index ecf9dd270..367ade691 100644 --- a/apps/emqx_durable_storage/src/emqx_durable_storage.app.src +++ b/apps/emqx_durable_storage/src/emqx_durable_storage.app.src @@ -2,7 +2,7 @@ {application, emqx_durable_storage, [ {description, "Message persistence and subscription replays for EMQX"}, % strict semver, bump manually! - {vsn, "0.1.2"}, + {vsn, "0.1.3"}, {modules, []}, {registered, []}, {applications, [kernel, stdlib, rocksdb, gproc, mria]}, diff --git a/apps/emqx_durable_storage/test/props/emqx_ds_message_storage_bitmask_shim.erl b/apps/emqx_durable_storage/test/props/emqx_ds_message_storage_bitmask_shim.erl index bbe16f518..10431eb1a 100644 --- a/apps/emqx_durable_storage/test/props/emqx_ds_message_storage_bitmask_shim.erl +++ b/apps/emqx_durable_storage/test/props/emqx_ds_message_storage_bitmask_shim.erl @@ -14,6 +14,8 @@ -opaque t() :: ets:tid(). +-export_type([t/0]). + -spec open() -> t(). open() -> ets:new(?MODULE, [ordered_set, {keypos, 1}]). 
diff --git a/apps/emqx_management/src/emqx_mgmt_api_clients.erl b/apps/emqx_management/src/emqx_mgmt_api_clients.erl index 18ac65ae6..d9b6d9bd5 100644 --- a/apps/emqx_management/src/emqx_mgmt_api_clients.erl +++ b/apps/emqx_management/src/emqx_mgmt_api_clients.erl @@ -927,7 +927,7 @@ format_channel_info(WhichNode, {_, ClientInfo0, ClientStats}) -> retry_interval, upgrade_qos, zone, - %% sessionID, defined in emqx_session.erl + %% session_id, defined in emqx_session.erl id ], TimesKeys = [created_at, connected_at, disconnected_at], From e8d7bb9a6767898e4aee6a3d2fbf977813f852dd Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Fri, 11 Aug 2023 10:05:00 -0300 Subject: [PATCH 13/85] refactor: rename module --- apps/emqx/priv/bpapi.versions | 2 +- apps/emqx/src/emqx_persistent_session_ds.erl | 2 +- ..._ds_proto_v1.erl => emqx_persistent_session_ds_proto_v1.erl} | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename apps/emqx/src/proto/{emqx_ds_proto_v1.erl => emqx_persistent_session_ds_proto_v1.erl} (96%) diff --git a/apps/emqx/priv/bpapi.versions b/apps/emqx/priv/bpapi.versions index b6a4c6e7a..68d42ee01 100644 --- a/apps/emqx/priv/bpapi.versions +++ b/apps/emqx/priv/bpapi.versions @@ -15,7 +15,6 @@ {emqx_conf,3}. {emqx_dashboard,1}. {emqx_delayed,1}. -{emqx_ds,1}. {emqx_eviction_agent,1}. {emqx_exhook,1}. {emqx_ft_storage_exporter_fs,1}. @@ -42,6 +41,7 @@ {emqx_node_rebalance_evacuation,1}. {emqx_node_rebalance_status,1}. {emqx_persistent_session,1}. +{emqx_persistent_session_ds,1}. {emqx_plugins,1}. {emqx_prometheus,1}. {emqx_resource,1}. diff --git a/apps/emqx/src/emqx_persistent_session_ds.erl b/apps/emqx/src/emqx_persistent_session_ds.erl index 7e9db4707..7d118ccb4 100644 --- a/apps/emqx/src/emqx_persistent_session_ds.erl +++ b/apps/emqx/src/emqx_persistent_session_ds.erl @@ -106,7 +106,7 @@ add_subscription(TopicFilterBin, DSSessionID) -> -spec open_iterator_on_all_nodes(emqx_topic:words(), emqx_ds:time(), emqx_ds:iterator_id()) -> ok. 
open_iterator_on_all_nodes(TopicFilter, StartMS, IteratorID) -> Nodes = emqx:running_nodes(), - Results = emqx_ds_proto_v1:open_iterator(Nodes, TopicFilter, StartMS, IteratorID), + Results = emqx_persistent_session_ds_proto_v1:open_iterator(Nodes, TopicFilter, StartMS, IteratorID), %% TODO: handle errors true = lists:all(fun(Res) -> Res =:= {ok, ok} end, Results), ok. diff --git a/apps/emqx/src/proto/emqx_ds_proto_v1.erl b/apps/emqx/src/proto/emqx_persistent_session_ds_proto_v1.erl similarity index 96% rename from apps/emqx/src/proto/emqx_ds_proto_v1.erl rename to apps/emqx/src/proto/emqx_persistent_session_ds_proto_v1.erl index 2283b7e4e..cd348cc2c 100644 --- a/apps/emqx/src/proto/emqx_ds_proto_v1.erl +++ b/apps/emqx/src/proto/emqx_persistent_session_ds_proto_v1.erl @@ -14,7 +14,7 @@ %% limitations under the License. %%-------------------------------------------------------------------- --module(emqx_ds_proto_v1). +-module(emqx_persistent_session_ds_proto_v1). -behaviour(emqx_bpapi). From 33ddbe80ad5e571b682a5a3f001d2fa42169bb01 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Fri, 11 Aug 2023 10:05:26 -0300 Subject: [PATCH 14/85] refactor: remove persistence leakeage from emqx_cm level --- apps/emqx/src/emqx_cm.erl | 12 ++---------- apps/emqx/src/emqx_session.erl | 14 +++++++++++++- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/apps/emqx/src/emqx_cm.erl b/apps/emqx/src/emqx_cm.erl index b98222959..ae6efb89c 100644 --- a/apps/emqx/src/emqx_cm.erl +++ b/apps/emqx/src/emqx_cm.erl @@ -301,17 +301,9 @@ open_session(false, ClientInfo = #{clientid := ClientId}, ConnInfo) -> emqx_cm_locker:trans(ClientId, ResumeStart). 
create_session(ClientInfo, ConnInfo) -> + #{clientid := ClientId} = ClientInfo, Options = get_session_confs(ClientInfo, ConnInfo), - #{clientid := ClientID} = ClientInfo, - Session0 = emqx_session:init(Options), - IteratorIDs = - case emqx_persistent_session_ds:open_session(ClientID) of - {skipped, disabled} -> - []; - {_IsNew, _DSSessionID, Iterators0} -> - Iterators0 - end, - Session = Session0#session{iterators = IteratorIDs}, + Session = emqx_session:init_and_open(ClientId, Options), ok = emqx_metrics:inc('session.created'), ok = emqx_hooks:run('session.created', [ClientInfo, emqx_session:info(Session)]), Session. diff --git a/apps/emqx/src/emqx_session.erl b/apps/emqx/src/emqx_session.erl index 8f1933fc1..9b877ae44 100644 --- a/apps/emqx/src/emqx_session.erl +++ b/apps/emqx/src/emqx_session.erl @@ -60,7 +60,7 @@ unpersist/1 ]). --export([init/1]). +-export([init/1, init_and_open/2]). -export([ info/1, @@ -166,6 +166,18 @@ %% Init a Session %%-------------------------------------------------------------------- +-spec init_and_open(emqx_types:clientid(), options()) -> session(). +init_and_open(ClientID, Options) -> + Session0 = emqx_session:init(Options), + IteratorIDs = + case emqx_persistent_session_ds:open_session(ClientID) of + {skipped, disabled} -> + []; + {_IsNew, _DSSessionID, Iterators0} -> + Iterators0 + end, + Session0#session{iterators = IteratorIDs}. + -spec init(options()) -> session(). 
init(Opts) -> MaxInflight = maps:get(max_inflight, Opts), From c28c6d1b7e62ced8f792ea99e9adcb89a1088bfd Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Fri, 11 Aug 2023 10:06:39 -0300 Subject: [PATCH 15/85] fix: ensure iterator is opened --- apps/emqx/src/emqx_persistent_session_ds.erl | 24 +++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/apps/emqx/src/emqx_persistent_session_ds.erl b/apps/emqx/src/emqx_persistent_session_ds.erl index 7d118ccb4..564c99c00 100644 --- a/apps/emqx/src/emqx_persistent_session_ds.erl +++ b/apps/emqx/src/emqx_persistent_session_ds.erl @@ -93,12 +93,7 @@ add_subscription(TopicFilterBin, DSSessionID) -> {ok, IteratorID, StartMS, IsNew} = emqx_ds:session_add_iterator( DSSessionID, TopicFilter ), - case IsNew of - true -> - ok = open_iterator_on_all_nodes(TopicFilter, StartMS, IteratorID); - false -> - ok - end, + ok = open_iterator_on_all_nodes(TopicFilter, StartMS, IteratorID), {ok, IteratorID, IsNew} end ). @@ -106,7 +101,9 @@ add_subscription(TopicFilterBin, DSSessionID) -> -spec open_iterator_on_all_nodes(emqx_topic:words(), emqx_ds:time(), emqx_ds:iterator_id()) -> ok. open_iterator_on_all_nodes(TopicFilter, StartMS, IteratorID) -> Nodes = emqx:running_nodes(), - Results = emqx_persistent_session_ds_proto_v1:open_iterator(Nodes, TopicFilter, StartMS, IteratorID), + Results = emqx_persistent_session_ds_proto_v1:open_iterator( + Nodes, TopicFilter, StartMS, IteratorID + ), %% TODO: handle errors true = lists:all(fun(Res) -> Res =:= {ok, ok} end, Results), ok. @@ -114,10 +111,15 @@ open_iterator_on_all_nodes(TopicFilter, StartMS, IteratorID) -> -spec do_open_iterator(emqx_topic:words(), emqx_ds:time(), emqx_ds:iterator_id()) -> ok. do_open_iterator(TopicFilter, StartMS, IteratorID) -> Replay = {TopicFilter, StartMS}, - %% FIXME: choose DS shard based on ...? 
- {ok, It} = emqx_ds_storage_layer:make_iterator(?DS_SHARD, Replay), - ok = emqx_ds_storage_layer:preserve_iterator(It, IteratorID), - ok. + case emqx_ds_storage_layer:is_iterator_present(?DS_SHARD, IteratorID) of + true -> + {ok, _It} = emqx_ds_storage_layer:restore_iterator(?DS_SHARD, IteratorID), + ok; + false -> + {ok, It} = emqx_ds_storage_layer:make_iterator(?DS_SHARD, Replay), + ok = emqx_ds_storage_layer:preserve_iterator(It, IteratorID), + ok + end. %% From 8eab389ae1d018961445aba6597e22ead954647f Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Fri, 11 Aug 2023 10:12:41 -0300 Subject: [PATCH 16/85] perf: avoid unnecessary transaction --- apps/emqx_durable_storage/src/emqx_ds.erl | 27 +++++++++-------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/apps/emqx_durable_storage/src/emqx_ds.erl b/apps/emqx_durable_storage/src/emqx_ds.erl index a69357975..654451561 100644 --- a/apps/emqx_durable_storage/src/emqx_ds.erl +++ b/apps/emqx_durable_storage/src/emqx_ds.erl @@ -126,23 +126,16 @@ message_stats() -> %% the broker. -spec session_open(emqx_types:clientid()) -> {_New :: boolean(), session_id(), [iterator_id()]}. session_open(ClientID) -> - {atomic, Ret} = - mria:transaction( - ?DS_SHARD, - fun() -> - case mnesia:read(?SESSION_TAB, ClientID) of - [#session{iterators = Iterators}] -> - IteratorIDs = maps:values(Iterators), - {false, ClientID, IteratorIDs}; - [] -> - Iterators = #{}, - Session = #session{id = ClientID, iterators = Iterators}, - mnesia:write(?SESSION_TAB, Session, write), - {true, ClientID, _IteratorIDs = []} - end - end - ), - Ret. + case mnesia:dirty_read(?SESSION_TAB, ClientID) of + [#session{iterators = Iterators}] -> + IteratorIDs = maps:values(Iterators), + {false, ClientID, IteratorIDs}; + [] -> + Iterators = #{}, + Session = #session{id = ClientID, iterators = Iterators}, + mria:dirty_write(?SESSION_TAB, Session), + {true, ClientID, _IteratorIDs = []} + end. 
%% @doc Called when a client reconnects with `clean session=true' or %% during session GC From 3239f5ac5bf7e1593f666881e1e28048aa3677c1 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Fri, 11 Aug 2023 10:43:16 -0300 Subject: [PATCH 17/85] feat: rm unnecessary transactions, use separate table for iterator references --- apps/emqx_durable_storage/src/emqx_ds.erl | 72 ++++++++----------- apps/emqx_durable_storage/src/emqx_ds_app.erl | 13 +++- apps/emqx_durable_storage/src/emqx_ds_int.hrl | 7 ++ 3 files changed, 47 insertions(+), 45 deletions(-) diff --git a/apps/emqx_durable_storage/src/emqx_ds.erl b/apps/emqx_durable_storage/src/emqx_ds.erl index 654451561..297c7d857 100644 --- a/apps/emqx_durable_storage/src/emqx_ds.erl +++ b/apps/emqx_durable_storage/src/emqx_ds.erl @@ -141,12 +141,7 @@ session_open(ClientID) -> %% during session GC -spec session_drop(emqx_types:clientid()) -> ok. session_drop(ClientID) -> - {atomic, ok} = mria:transaction( - ?DS_SHARD, - fun() -> - mnesia:delete({?SESSION_TAB, ClientID}) - end - ), + ok = mria:dirty_delete({?SESSION_TAB, ClientID}), ok. %% @doc Called when a client disconnects. This function terminates all @@ -158,38 +153,32 @@ session_suspend(_SessionId) -> %% @doc Called when a client subscribes to a topic. Idempotent. -spec session_add_iterator(session_id(), emqx_topic:words()) -> - {ok, iterator_id(), time(), _IsNew :: boolean()} | {error, session_not_found}. + {ok, iterator_id(), time(), _IsNew :: boolean()}. 
session_add_iterator(DSSessionId, TopicFilter) -> - {atomic, Ret} = - mria:transaction( - ?DS_SHARD, - fun() -> - case mnesia:wread({?SESSION_TAB, DSSessionId}) of - [] -> - {error, session_not_found}; - [#session{iterators = #{TopicFilter := IteratorId}}] -> - StartMS = get_start_ms(IteratorId, DSSessionId), - ?tp( - ds_session_subscription_present, - #{iterator_id => IteratorId, session_id => DSSessionId} - ), - IsNew = false, - {ok, IteratorId, StartMS, IsNew}; - [#session{iterators = Iterators0} = Session0] -> - {IteratorId, StartMS} = new_iterator_id(DSSessionId), - Iterators = Iterators0#{TopicFilter => IteratorId}, - Session = Session0#session{iterators = Iterators}, - mnesia:write(?SESSION_TAB, Session, write), - ?tp( - ds_session_subscription_added, - #{iterator_id => IteratorId, session_id => DSSessionId} - ), - IsNew = true, - {ok, IteratorId, StartMS, IsNew} - end - end - ), - Ret. + IteratorRefId = {DSSessionId, TopicFilter}, + case mnesia:dirty_read(?ITERATOR_REF_TAB, IteratorRefId) of + [] -> + {IteratorId, StartMS} = new_iterator_id(DSSessionId), + IteratorRef = #iterator_ref{ + ref_id = IteratorRefId, + it_id = IteratorId, + start_time = StartMS + }, + ok = mria:dirty_write(?ITERATOR_REF_TAB, IteratorRef), + ?tp( + ds_session_subscription_added, + #{iterator_id => IteratorId, session_id => DSSessionId} + ), + IsNew = true, + {ok, IteratorId, StartMS, IsNew}; + [#iterator_ref{it_id = IteratorId, start_time = StartMS}] -> + ?tp( + ds_session_subscription_present, + #{iterator_id => IteratorId, session_id => DSSessionId} + ), + IsNew = false, + {ok, IteratorId, StartMS, IsNew} + end. %% @doc Called when a client unsubscribes from a topic. Returns `true' %% if the session contained the subscription or `false' if it wasn't @@ -231,10 +220,5 @@ iterator_stats() -> -spec new_iterator_id(session_id()) -> {iterator_id(), time()}. new_iterator_id(DSSessionId) -> NowMS = erlang:system_time(microsecond), - NowMSBin = integer_to_binary(NowMS), - {<>, NowMS}. 
- --spec get_start_ms(iterator_id(), emqx_session:session_id()) -> time(). -get_start_ms(IteratorId, SessionId) -> - <> = IteratorId, - binary_to_integer(StartMSBin). + IteratorId = <>, + {IteratorId, NowMS}. diff --git a/apps/emqx_durable_storage/src/emqx_ds_app.erl b/apps/emqx_durable_storage/src/emqx_ds_app.erl index 216e979ee..cbcdb0b8c 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_app.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_app.erl @@ -25,7 +25,18 @@ init_mnesia() -> {record_name, session}, {attributes, record_info(fields, session)} ] - ). + ), + ok = mria:create_table( + ?ITERATOR_REF_TAB, + [ + {rlog_shard, ?DS_SHARD}, + {type, ordered_set}, + {storage, storage()}, + {record_name, iterator_ref}, + {attributes, record_info(fields, iterator_ref)} + ] + ), + ok. storage() -> case mria:rocksdb_backend_available() of diff --git a/apps/emqx_durable_storage/src/emqx_ds_int.hrl b/apps/emqx_durable_storage/src/emqx_ds_int.hrl index fa11a6600..55223068f 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_int.hrl +++ b/apps/emqx_durable_storage/src/emqx_ds_int.hrl @@ -17,6 +17,7 @@ -define(EMQX_DS_HRL, true). -define(SESSION_TAB, emqx_ds_session). +-define(ITERATOR_REF_TAB, emqx_ds_iterator_ref). -define(DS_SHARD, emqx_ds_shard). -record(session, { @@ -24,4 +25,10 @@ iterators :: #{emqx_topic:words() => emqx_ds:iterator_id()} }). +-record(iterator_ref, { + ref_id :: {emqx_ds:session_id(), emqx_topic:words()}, + it_id :: emqx_ds:iterator_id(), + start_time :: emqx_ds:time() +}). + -endif. 
From e4e88ebf36b57a43f2a4327578971f086f6601f1 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Fri, 11 Aug 2023 14:11:28 -0300 Subject: [PATCH 18/85] test: add scenario for node stopping midway during subscribe --- apps/emqx/src/emqx_persistent_session_ds.erl | 16 +- apps/emqx/test/emqx_cth_cluster.erl | 18 +- apps/emqx/test/emqx_cth_suite.erl | 2 + .../test/emqx_persistent_messages_SUITE.erl | 8 + .../src/emqx_ds_storage_layer.erl | 32 +++- .../test/emqx_ds_SUITE.erl | 178 ++++++++++++++++++ 6 files changed, 241 insertions(+), 13 deletions(-) create mode 100644 apps/emqx_durable_storage/test/emqx_ds_SUITE.erl diff --git a/apps/emqx/src/emqx_persistent_session_ds.erl b/apps/emqx/src/emqx_persistent_session_ds.erl index 564c99c00..0120f09d4 100644 --- a/apps/emqx/src/emqx_persistent_session_ds.erl +++ b/apps/emqx/src/emqx_persistent_session_ds.erl @@ -93,13 +93,27 @@ add_subscription(TopicFilterBin, DSSessionID) -> {ok, IteratorID, StartMS, IsNew} = emqx_ds:session_add_iterator( DSSessionID, TopicFilter ), - ok = open_iterator_on_all_nodes(TopicFilter, StartMS, IteratorID), + Ctx = #{ + iterator_id => IteratorID, + start_time => StartMS, + is_new => IsNew + }, + ?tp(persistent_session_ds_iterator_added, Ctx), + ?tp_span( + persistent_session_ds_open_iterators, + Ctx, + ok = open_iterator_on_all_nodes(TopicFilter, StartMS, IteratorID) + ), {ok, IteratorID, IsNew} end ). -spec open_iterator_on_all_nodes(emqx_topic:words(), emqx_ds:time(), emqx_ds:iterator_id()) -> ok. 
open_iterator_on_all_nodes(TopicFilter, StartMS, IteratorID) -> + ?tp(persistent_session_ds_will_open_iterators, #{ + iterator_id => IteratorID, + start_time => StartMS + }), Nodes = emqx:running_nodes(), Results = emqx_persistent_session_ds_proto_v1:open_iterator( Nodes, TopicFilter, StartMS, IteratorID diff --git a/apps/emqx/test/emqx_cth_cluster.erl b/apps/emqx/test/emqx_cth_cluster.erl index ddcb3234c..cbbad0aa2 100644 --- a/apps/emqx/test/emqx_cth_cluster.erl +++ b/apps/emqx/test/emqx_cth_cluster.erl @@ -17,7 +17,7 @@ -module(emqx_cth_cluster). -export([start/2]). --export([stop/1]). +-export([stop/1, stop_node/1]). -export([share_load_module/2]). -export([node_name/1]). @@ -80,7 +80,12 @@ when %% Working directory %% Everything a test produces should go here. Each node's stuff should go in its %% own directory. - work_dir := file:name() + work_dir := file:name(), + %% Usually, we want to ensure the node / test suite starts from a clean slate. + %% However, sometimes, we may want to test restarting a node. For such + %% situations, we need to disable this check to allow resuming from an existing + %% state. + skip_clean_suite_state_check => boolean() }. start(Nodes, ClusterOpts) -> NodeSpecs = mk_nodespecs(Nodes, ClusterOpts), @@ -124,12 +129,14 @@ mk_init_nodespec(N, Name, NodeOpts, ClusterOpts) -> Node = node_name(Name), BasePort = base_port(N), WorkDir = maps:get(work_dir, ClusterOpts), + SkipCleanSuiteStateCheck = maps:get(skip_clean_suite_state_check, ClusterOpts, false), Defaults = #{ name => Node, role => core, apps => [], base_port => BasePort, work_dir => filename:join([WorkDir, Node]), + skip_clean_suite_state_check => SkipCleanSuiteStateCheck, driver => ct_slave }, maps:merge(Defaults, NodeOpts). @@ -288,17 +295,20 @@ load_apps(Node, #{apps := Apps}) -> erpc:call(Node, emqx_cth_suite, load_apps, [Apps]). 
start_apps_clustering(Node, #{apps := Apps} = Spec) -> - SuiteOpts = maps:with([work_dir], Spec), + SuiteOpts = suite_opts(Spec), AppsClustering = [lists:keyfind(App, 1, Apps) || App <- ?APPS_CLUSTERING], _Started = erpc:call(Node, emqx_cth_suite, start, [AppsClustering, SuiteOpts]), ok. start_apps(Node, #{apps := Apps} = Spec) -> - SuiteOpts = maps:with([work_dir], Spec), + SuiteOpts = suite_opts(Spec), AppsRest = [AppSpec || AppSpec = {App, _} <- Apps, not lists:member(App, ?APPS_CLUSTERING)], _Started = erpc:call(Node, emqx_cth_suite, start_apps, [AppsRest, SuiteOpts]), ok. +suite_opts(Spec) -> + maps:with([work_dir, skip_clean_suite_state_check], Spec). + maybe_join_cluster(_Node, #{role := replicant}) -> ok; maybe_join_cluster(Node, Spec) -> diff --git a/apps/emqx/test/emqx_cth_suite.erl b/apps/emqx/test/emqx_cth_suite.erl index 9b3e58da4..dbe9423da 100644 --- a/apps/emqx/test/emqx_cth_suite.erl +++ b/apps/emqx/test/emqx_cth_suite.erl @@ -358,6 +358,8 @@ stop_apps(Apps) -> %% +verify_clean_suite_state(#{skip_clean_suite_state_check := true}) -> + ok; verify_clean_suite_state(#{work_dir := WorkDir}) -> {ok, []} = file:list_dir(WorkDir), none = persistent_term:get(?EMQX_AUTHENTICATION_SCHEMA_MODULE_PT_KEY, none), diff --git a/apps/emqx/test/emqx_persistent_messages_SUITE.erl b/apps/emqx/test/emqx_persistent_messages_SUITE.erl index 62702b3bc..b669be889 100644 --- a/apps/emqx/test/emqx_persistent_messages_SUITE.erl +++ b/apps/emqx/test/emqx_persistent_messages_SUITE.erl @@ -183,6 +183,8 @@ t_session_subscription_iterators(Config) -> ok end, ?assertMatch([_], IteratorIds), + ?assertMatch({ok, [_]}, get_all_iterator_ids(Node1)), + ?assertMatch({ok, [_]}, get_all_iterator_ids(Node2)), [IteratorId] = IteratorIds, ReplayMessages1 = erpc:call(Node1, fun() -> consume(?DS_SHARD, IteratorId) end), ExpectedMessages = [Message2, Message3], @@ -280,3 +282,9 @@ cluster() -> get_mqtt_port(Node, Type) -> {_IP, Port} = erpc:call(Node, emqx_config, get, [[listeners, Type, 
default, bind]]), Port. + +get_all_iterator_ids(Node) -> + Fn = fun(K, _V, Acc) -> [K | Acc] end, + erpc:call(Node, fun() -> + emqx_ds_storage_layer:foldl_iterator_prefix(?DS_SHARD, <<>>, Fn, []) + end). diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl index ac4649d94..9bc7924e8 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl @@ -18,7 +18,8 @@ restore_iterator/2, discard_iterator/2, is_iterator_present/2, - discard_iterator_prefix/2 + discard_iterator_prefix/2, + foldl_iterator_prefix/4 ]). %% behaviour callbacks: @@ -204,6 +205,16 @@ discard_iterator(Shard, ReplayID) -> discard_iterator_prefix(Shard, KeyPrefix) -> do_discard_iterator_prefix(Shard, KeyPrefix). +-spec foldl_iterator_prefix( + emqx_ds:shard(), + binary(), + fun((_Key :: binary(), _Value :: binary(), Acc) -> Acc), + Acc +) -> {ok, Acc} | {error, _TODO} when + Acc :: term(). +foldl_iterator_prefix(Shard, KeyPrefix, Fn, Acc) -> + do_foldl_iterator_prefix(Shard, KeyPrefix, Fn, Acc). + %%================================================================================ %% behaviour callbacks %%================================================================================ @@ -414,26 +425,31 @@ restore_iterator_state( open_restore_iterator(meta_get_gen(Shard, Gen), It, State). do_discard_iterator_prefix(Shard, KeyPrefix) -> + #db{handle = DBHandle, cf_iterator = CF} = meta_lookup(Shard, db), + Fn = fun(K, _V, _Acc) -> ok = rocksdb:delete(DBHandle, CF, K, ?ITERATION_WRITE_OPTS) end, + do_foldl_iterator_prefix(Shard, KeyPrefix, Fn, ok). 
+ +do_foldl_iterator_prefix(Shard, KeyPrefix, Fn, Acc) -> #db{handle = Handle, cf_iterator = CF} = meta_lookup(Shard, db), case rocksdb:iterator(Handle, CF, ?ITERATION_READ_OPTS) of {ok, It} -> NextAction = {seek, KeyPrefix}, - do_discard_iterator_prefix(Handle, CF, It, KeyPrefix, NextAction); + do_foldl_iterator_prefix(Handle, CF, It, KeyPrefix, NextAction, Fn, Acc); Error -> Error end. -do_discard_iterator_prefix(DBHandle, CF, It, KeyPrefix, NextAction) -> +do_foldl_iterator_prefix(DBHandle, CF, It, KeyPrefix, NextAction, Fn, Acc) -> case rocksdb:iterator_move(It, NextAction) of - {ok, K = <>, _V} -> - ok = rocksdb:delete(DBHandle, CF, K, ?ITERATION_WRITE_OPTS), - do_discard_iterator_prefix(DBHandle, CF, It, KeyPrefix, next); + {ok, K = <>, V} -> + NewAcc = Fn(K, V, Acc), + do_foldl_iterator_prefix(DBHandle, CF, It, KeyPrefix, next, Fn, NewAcc); {ok, _K, _V} -> ok = rocksdb:iterator_close(It), - ok; + {ok, Acc}; {error, invalid_iterator} -> ok = rocksdb:iterator_close(It), - ok; + {ok, Acc}; Error -> ok = rocksdb:iterator_close(It), Error diff --git a/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl b/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl new file mode 100644 index 000000000..8a2d18c0d --- /dev/null +++ b/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl @@ -0,0 +1,178 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- +-module(emqx_ds_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("stdlib/include/assert.hrl"). +-include_lib("common_test/include/ct.hrl"). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). + +-define(DS_SHARD, <<"local">>). 
+ +%%------------------------------------------------------------------------------ +%% CT boilerplate +%%------------------------------------------------------------------------------ + +all() -> + emqx_common_test_helpers:all(?MODULE). + +init_per_suite(Config) -> + %% avoid inter-suite flakiness... + application:stop(emqx), + application:stop(emqx_durable_storage), + TCApps = emqx_cth_suite:start( + app_specs(), + #{work_dir => ?config(priv_dir, Config)} + ), + [{tc_apps, TCApps} | Config]. + +end_per_suite(Config) -> + TCApps = ?config(tc_apps, Config), + emqx_cth_suite:stop(TCApps), + ok. + +init_per_testcase(t_session_subscription_idempotency, Config) -> + Cluster = cluster(#{n => 1}), + Nodes = emqx_cth_cluster:start(Cluster, #{work_dir => ?config(priv_dir, Config)}), + [{cluster, Cluster}, {nodes, Nodes} | Config]; +init_per_testcase(_TestCase, Config) -> + Config. + +end_per_testcase(t_session_subscription_idempotency, Config) -> + Nodes = ?config(nodes, Config), + ok = emqx_cth_cluster:stop(Nodes), + ok; +end_per_testcase(_TestCase, _Config) -> + ok. + +%%------------------------------------------------------------------------------ +%% Helper fns +%%------------------------------------------------------------------------------ + +cluster(#{n := N}) -> + Node1 = ds_SUITE1, + Spec = #{ + role => core, + join_to => emqx_cth_cluster:node_name(Node1), + listeners => true, + apps => app_specs() + }, + [ + {Node1, Spec} + | lists:map( + fun(M) -> + Name = binary_to_atom(<<"ds_SUITE", (integer_to_binary(M))/binary>>), + {Name, Spec} + end, + lists:seq(2, N) + ) + ]. + +app_specs() -> + [ + emqx_durable_storage, + {emqx, #{ + before_start => fun() -> + emqx_app:set_config_loader(?MODULE) + end, + config => #{persistent_session_store => #{ds => true}}, + override_env => [{boot_modules, [broker, listeners]}] + }} + ]. + +get_mqtt_port(Node, Type) -> + {_IP, Port} = erpc:call(Node, emqx_config, get, [[listeners, Type, default, bind]]), + Port. 
+ +get_all_iterator_ids(Node) -> + Fn = fun(K, _V, Acc) -> [K | Acc] end, + erpc:call(Node, fun() -> + emqx_ds_storage_layer:foldl_iterator_prefix(?DS_SHARD, <<>>, Fn, []) + end). + +%%------------------------------------------------------------------------------ +%% Testcases +%%------------------------------------------------------------------------------ + +t_session_subscription_idempotency(Config) -> + Cluster = ?config(cluster, Config), + [Node1] = ?config(nodes, Config), + Port = get_mqtt_port(Node1, tcp), + SubTopicFilter = <<"t/+">>, + ClientId = <<"myclientid">>, + ?check_trace( + begin + ?force_ordering( + #{?snk_kind := persistent_session_ds_iterator_added}, + _NEvents0 = 1, + #{?snk_kind := will_restart_node}, + _Guard0 = true + ), + ?force_ordering( + #{?snk_kind := restarted_node}, + _NEvents1 = 1, + #{?snk_kind := persistent_session_ds_open_iterators, ?snk_span := start}, + _Guard1 = true + ), + + spawn_link(fun() -> + ?tp(will_restart_node, #{}), + ct:pal("stopping node ~p", [Node1]), + ok = emqx_cth_cluster:stop_node(Node1), + ct:pal("stopped node ~p; restarting...", [Node1]), + [Node1] = emqx_cth_cluster:start(Cluster, #{ + work_dir => ?config(priv_dir, Config), + skip_clean_suite_state_check => true + }), + ct:pal("node ~p restarted", [Node1]), + ?tp(restarted_node, #{}), + ok + end), + + ct:pal("starting 1"), + {ok, Client0} = emqtt:start_link([ + {port, Port}, + {clientid, ClientId}, + {proto_ver, v5} + ]), + {ok, _} = emqtt:connect(Client0), + ct:pal("subscribing 1"), + process_flag(trap_exit, true), + catch emqtt:subscribe(Client0, SubTopicFilter, qos2), + receive + {'EXIT', {shutdown, _}} -> + ok + after 0 -> ok + end, + process_flag(trap_exit, false), + + {ok, _} = ?block_until(#{?snk_kind := restarted_node}, 15_000), + ct:pal("starting 2"), + {ok, Client1} = emqtt:start_link([ + {port, Port}, + {clientid, ClientId}, + {proto_ver, v5} + ]), + {ok, _} = emqtt:connect(Client1), + ct:pal("subscribing 2"), + {ok, _, [2]} = 
emqtt:subscribe(Client1, SubTopicFilter, qos2), + + ok = emqtt:stop(Client1), + + ok + end, + fun(Trace) -> + ct:pal("trace:\n ~p", [Trace]), + %% Exactly one iterator should have been opened. + ?assertMatch({ok, [_]}, get_all_iterator_ids(Node1)), + ?assertMatch( + {_IsNew = false, ClientId, _}, + erpc:call(Node1, emqx_ds, session_open, [ClientId]) + ), + ok + end + ), + ok. From 021755b82bb1d5293620cacd6b324e669c9b2555 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Fri, 11 Aug 2023 15:08:38 -0300 Subject: [PATCH 19/85] refactor: rm iterators from DS `#session{}` record --- apps/emqx/include/emqx_session.hrl | 4 +-- apps/emqx/src/emqx_persistent_session_ds.erl | 2 +- apps/emqx/src/emqx_session.erl | 25 ++++-------------- .../test/emqx_persistent_messages_SUITE.erl | 17 ++++-------- apps/emqx/test/emqx_proper_types.erl | 3 +-- apps/emqx_durable_storage/src/emqx_ds.erl | 13 +++++----- apps/emqx_durable_storage/src/emqx_ds_int.hrl | 4 ++- .../src/emqx_ds_storage_layer.erl | 26 +++++++++++++++++-- .../test/emqx_ds_SUITE.erl | 2 +- 9 files changed, 47 insertions(+), 49 deletions(-) diff --git a/apps/emqx/include/emqx_session.hrl b/apps/emqx/include/emqx_session.hrl index fba4cf911..3fea157ed 100644 --- a/apps/emqx/include/emqx_session.hrl +++ b/apps/emqx/include/emqx_session.hrl @@ -49,9 +49,7 @@ %% Awaiting PUBREL Timeout (Unit: millisecond) await_rel_timeout :: timeout(), %% Created at - created_at :: pos_integer(), - %% Durable storage iterators for existing subscriptions - iterators = [] :: [emqx_ds_replay:replay_id()] + created_at :: pos_integer() }). -endif. diff --git a/apps/emqx/src/emqx_persistent_session_ds.erl b/apps/emqx/src/emqx_persistent_session_ds.erl index 0120f09d4..19e11b1a3 100644 --- a/apps/emqx/src/emqx_persistent_session_ds.erl +++ b/apps/emqx/src/emqx_persistent_session_ds.erl @@ -85,7 +85,7 @@ open_session(ClientID) -> ?WHEN_ENABLED(emqx_ds:session_open(ClientID)). 
-spec add_subscription(emqx_types:topic(), emqx_ds:session_id()) -> - {ok, emqx_ds:iterator_id(), _IsNew :: boolean()} | {skipped, disabled}. + {ok, emqx_ds:iterator_id(), IsNew :: boolean()} | {skipped, disabled}. add_subscription(TopicFilterBin, DSSessionID) -> ?WHEN_ENABLED( begin diff --git a/apps/emqx/src/emqx_session.erl b/apps/emqx/src/emqx_session.erl index 9b877ae44..32c98290a 100644 --- a/apps/emqx/src/emqx_session.erl +++ b/apps/emqx/src/emqx_session.erl @@ -169,14 +169,8 @@ -spec init_and_open(emqx_types:clientid(), options()) -> session(). init_and_open(ClientID, Options) -> Session0 = emqx_session:init(Options), - IteratorIDs = - case emqx_persistent_session_ds:open_session(ClientID) of - {skipped, disabled} -> - []; - {_IsNew, _DSSessionID, Iterators0} -> - Iterators0 - end, - Session0#session{iterators = IteratorIDs}. + _ = emqx_persistent_session_ds:open_session(ClientID), + Session0. -spec init(options()) -> session(). init(Opts) -> @@ -274,9 +268,7 @@ info(awaiting_rel_max, #session{max_awaiting_rel = Max}) -> info(await_rel_timeout, #session{await_rel_timeout = Timeout}) -> Timeout; info(created_at, #session{created_at = CreatedAt}) -> - CreatedAt; -info(iterators, #session{iterators = IteratorIds}) -> - IteratorIds. + CreatedAt. %% @doc Get stats of the session. -spec stats(session()) -> emqx_types:stats(). @@ -325,15 +317,8 @@ is_subscriptions_full(#session{ -spec add_persistent_subscription(emqx_types:topic(), emqx_types:clientid(), session()) -> session(). add_persistent_subscription(TopicFilterBin, ClientId, Session) -> - case emqx_persistent_session_ds:add_subscription(TopicFilterBin, ClientId) of - {skipped, disabled} -> - Session; - {ok, IteratorID, _IsNew = true} -> - Iterators = Session#session.iterators, - Session#session{iterators = [IteratorID | Iterators]}; - {ok, _IteratorID, _IsNew = false} -> - Session - end. + _ = emqx_persistent_session_ds:add_subscription(TopicFilterBin, ClientId), + Session. 
%%-------------------------------------------------------------------- %% Client -> Broker: UNSUBSCRIBE diff --git a/apps/emqx/test/emqx_persistent_messages_SUITE.erl b/apps/emqx/test/emqx_persistent_messages_SUITE.erl index b669be889..7ca1f3f15 100644 --- a/apps/emqx/test/emqx_persistent_messages_SUITE.erl +++ b/apps/emqx/test/emqx_persistent_messages_SUITE.erl @@ -148,18 +148,15 @@ t_session_subscription_iterators(Config) -> ct:pal("publishing 4"), Message4 = emqx_message:make(AnotherTopic, Payload4), publish(Node1, Message4), - IteratorIds = get_iterator_ids(Node1, ClientId), emqtt:stop(Client), #{ - messages => [Message1, Message2, Message3, Message4], - iterator_ids => IteratorIds + messages => [Message1, Message2, Message3, Message4] } end, fun(Results, Trace) -> ct:pal("trace:\n ~p", [Trace]), #{ - messages := [_Message1, Message2, Message3 | _], - iterator_ids := IteratorIds + messages := [_Message1, Message2, Message3 | _] } = Results, case ?of_kind(ds_session_subscription_added, Trace) of [] -> @@ -182,10 +179,9 @@ t_session_subscription_iterators(Config) -> ), ok end, - ?assertMatch([_], IteratorIds), ?assertMatch({ok, [_]}, get_all_iterator_ids(Node1)), - ?assertMatch({ok, [_]}, get_all_iterator_ids(Node2)), - [IteratorId] = IteratorIds, + {ok, [IteratorId]} = get_all_iterator_ids(Node1), + ?assertMatch({ok, [IteratorId]}, get_all_iterator_ids(Node2)), ReplayMessages1 = erpc:call(Node1, fun() -> consume(?DS_SHARD, IteratorId) end), ExpectedMessages = [Message2, Message3], ?assertEqual(ExpectedMessages, ReplayMessages1), @@ -284,7 +280,4 @@ get_mqtt_port(Node, Type) -> Port. get_all_iterator_ids(Node) -> - Fn = fun(K, _V, Acc) -> [K | Acc] end, - erpc:call(Node, fun() -> - emqx_ds_storage_layer:foldl_iterator_prefix(?DS_SHARD, <<>>, Fn, []) - end). + erpc:call(Node, emqx_ds_storage_layer, list_iterator_prefix, [?DS_SHARD, <<>>]). 
diff --git a/apps/emqx/test/emqx_proper_types.erl b/apps/emqx/test/emqx_proper_types.erl index 56e0b23b8..ab1720754 100644 --- a/apps/emqx/test/emqx_proper_types.erl +++ b/apps/emqx/test/emqx_proper_types.erl @@ -147,8 +147,7 @@ sessioninfo() -> awaiting_rel = awaiting_rel(), max_awaiting_rel = non_neg_integer(), await_rel_timeout = safty_timeout(), - created_at = timestamp(), - iterators = [] + created_at = timestamp() }, emqx_session:info(Session) ). diff --git a/apps/emqx_durable_storage/src/emqx_ds.erl b/apps/emqx_durable_storage/src/emqx_ds.erl index 297c7d857..3cc7ca886 100644 --- a/apps/emqx_durable_storage/src/emqx_ds.erl +++ b/apps/emqx_durable_storage/src/emqx_ds.erl @@ -15,6 +15,7 @@ %%-------------------------------------------------------------------- -module(emqx_ds). +-include_lib("stdlib/include/ms_transform.hrl"). -include_lib("snabbkaffe/include/snabbkaffe.hrl"). %% API: @@ -124,17 +125,15 @@ message_stats() -> %% %% Note: session API doesn't handle session takeovers, it's the job of %% the broker. --spec session_open(emqx_types:clientid()) -> {_New :: boolean(), session_id(), [iterator_id()]}. +-spec session_open(emqx_types:clientid()) -> {_New :: boolean(), session_id()}. session_open(ClientID) -> case mnesia:dirty_read(?SESSION_TAB, ClientID) of - [#session{iterators = Iterators}] -> - IteratorIDs = maps:values(Iterators), - {false, ClientID, IteratorIDs}; + [#session{}] -> + {false, ClientID}; [] -> - Iterators = #{}, - Session = #session{id = ClientID, iterators = Iterators}, + Session = #session{id = ClientID}, mria:dirty_write(?SESSION_TAB, Session), - {true, ClientID, _IteratorIDs = []} + {true, ClientID} end. 
%% @doc Called when a client reconnects with `clean session=true' or diff --git a/apps/emqx_durable_storage/src/emqx_ds_int.hrl b/apps/emqx_durable_storage/src/emqx_ds_int.hrl index 55223068f..47493bd0b 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_int.hrl +++ b/apps/emqx_durable_storage/src/emqx_ds_int.hrl @@ -21,8 +21,10 @@ -define(DS_SHARD, emqx_ds_shard). -record(session, { + %% same as clientid id :: emqx_ds:session_id(), - iterators :: #{emqx_topic:words() => emqx_ds:iterator_id()} + %% for future usage + props = #{} :: map() }). -record(iterator_ref, { diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl index 9bc7924e8..adede5322 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl @@ -19,6 +19,7 @@ discard_iterator/2, is_iterator_present/2, discard_iterator_prefix/2, + list_iterator_prefix/2, foldl_iterator_prefix/4 ]). @@ -203,7 +204,17 @@ discard_iterator(Shard, ReplayID) -> -spec discard_iterator_prefix(emqx_ds:shard(), binary()) -> ok | {error, _TODO}. discard_iterator_prefix(Shard, KeyPrefix) -> - do_discard_iterator_prefix(Shard, KeyPrefix). + case do_discard_iterator_prefix(Shard, KeyPrefix) of + {ok, _} -> ok; + Error -> Error + end. + +-spec list_iterator_prefix( + emqx_ds:shard(), + binary() +) -> {ok, [emqx_ds:iterator_id()]} | {error, _TODO}. +list_iterator_prefix(Shard, KeyPrefix) -> + do_list_iterator_prefix(Shard, KeyPrefix). -spec foldl_iterator_prefix( emqx_ds:shard(), @@ -377,7 +388,11 @@ open_restore_iterator(#{module := Mod, data := Data}, It = #it{replay = Replay}, %% --define(KEY_REPLAY_STATE(ReplayID), <<(ReplayID)/binary, "rs">>). +-define(KEY_REPLAY_STATE(IteratorId), <<(IteratorId)/binary, "rs">>). +-define(KEY_REPLAY_STATE_PAT(KeyReplayState), begin + <> = (KeyReplayState), + IteratorId +end). -define(ITERATION_WRITE_OPTS, []). -define(ITERATION_READ_OPTS, []). 
@@ -424,6 +439,13 @@ restore_iterator_state( It = #it{shard = Shard, gen = Gen, replay = {TopicFilter, StartTime}}, open_restore_iterator(meta_get_gen(Shard, Gen), It, State). +do_list_iterator_prefix(Shard, KeyPrefix) -> + Fn = fun(K0, _V, Acc) -> + K = ?KEY_REPLAY_STATE_PAT(K0), + [K | Acc] + end, + do_foldl_iterator_prefix(Shard, KeyPrefix, Fn, []). + do_discard_iterator_prefix(Shard, KeyPrefix) -> #db{handle = DBHandle, cf_iterator = CF} = meta_lookup(Shard, db), Fn = fun(K, _V, _Acc) -> ok = rocksdb:delete(DBHandle, CF, K, ?ITERATION_WRITE_OPTS) end, diff --git a/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl b/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl index 8a2d18c0d..73eb28d85 100644 --- a/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl +++ b/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl @@ -169,7 +169,7 @@ t_session_subscription_idempotency(Config) -> %% Exactly one iterator should have been opened. ?assertMatch({ok, [_]}, get_all_iterator_ids(Node1)), ?assertMatch( - {_IsNew = false, ClientId, _}, + {_IsNew = false, ClientId}, erpc:call(Node1, emqx_ds, session_open, [ClientId]) ), ok From c74abe79d072bb1a9309e6c812434a21cbab86b1 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Tue, 15 Aug 2023 16:07:30 -0300 Subject: [PATCH 20/85] refactor: reduce arity --- apps/emqx/src/emqx_cm.erl | 3 +-- apps/emqx/src/emqx_session.erl | 7 ++++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/apps/emqx/src/emqx_cm.erl b/apps/emqx/src/emqx_cm.erl index ae6efb89c..2cc2b72b4 100644 --- a/apps/emqx/src/emqx_cm.erl +++ b/apps/emqx/src/emqx_cm.erl @@ -301,9 +301,8 @@ open_session(false, ClientInfo = #{clientid := ClientId}, ConnInfo) -> emqx_cm_locker:trans(ClientId, ResumeStart). 
create_session(ClientInfo, ConnInfo) -> - #{clientid := ClientId} = ClientInfo, Options = get_session_confs(ClientInfo, ConnInfo), - Session = emqx_session:init_and_open(ClientId, Options), + Session = emqx_session:init_and_open(Options), ok = emqx_metrics:inc('session.created'), ok = emqx_hooks:run('session.created', [ClientInfo, emqx_session:info(Session)]), Session. diff --git a/apps/emqx/src/emqx_session.erl b/apps/emqx/src/emqx_session.erl index 32c98290a..0c051f002 100644 --- a/apps/emqx/src/emqx_session.erl +++ b/apps/emqx/src/emqx_session.erl @@ -60,7 +60,7 @@ unpersist/1 ]). --export([init/1, init_and_open/2]). +-export([init/1, init_and_open/1]). -export([ info/1, @@ -166,8 +166,9 @@ %% Init a Session %%-------------------------------------------------------------------- --spec init_and_open(emqx_types:clientid(), options()) -> session(). -init_and_open(ClientID, Options) -> +-spec init_and_open(options()) -> session(). +init_and_open(Options) -> + #{clientid := ClientID} = Options, Session0 = emqx_session:init(Options), _ = emqx_persistent_session_ds:open_session(ClientID), Session0. 
From a15405a800ed1406ffb2e82cc2f0ef43e1d9fb34 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Tue, 15 Aug 2023 16:17:37 -0300 Subject: [PATCH 21/85] test: fix assertions --- apps/emqx/test/emqx_persistent_messages_SUITE.erl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/apps/emqx/test/emqx_persistent_messages_SUITE.erl b/apps/emqx/test/emqx_persistent_messages_SUITE.erl index 7ca1f3f15..9d814aad6 100644 --- a/apps/emqx/test/emqx_persistent_messages_SUITE.erl +++ b/apps/emqx/test/emqx_persistent_messages_SUITE.erl @@ -132,19 +132,18 @@ t_session_subscription_iterators(Config) -> ct:pal("publishing 1"), Message1 = emqx_message:make(Topic, Payload1), publish(Node1, Message1), - receive_messages(1), ct:pal("subscribing 1"), {ok, _, [2]} = emqtt:subscribe(Client, SubTopicFilter, qos2), ct:pal("publishing 2"), Message2 = emqx_message:make(Topic, Payload2), publish(Node1, Message2), - receive_messages(1), + [_] = receive_messages(1), ct:pal("subscribing 2"), {ok, _, [1]} = emqtt:subscribe(Client, SubTopicFilter, qos1), ct:pal("publishing 3"), Message3 = emqx_message:make(Topic, Payload3), publish(Node1, Message3), - receive_messages(1), + [_] = receive_messages(1), ct:pal("publishing 4"), Message4 = emqx_message:make(AnotherTopic, Payload4), publish(Node1, Message4), From 6de0bbe76a6611c52d5b59204ce718c03ca1c135 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Tue, 15 Aug 2023 16:20:36 -0300 Subject: [PATCH 22/85] test(refactor): always allocate listeners for emqx app --- apps/emqx/test/emqx_cth_cluster.erl | 2 +- apps/emqx/test/emqx_persistent_messages_SUITE.erl | 1 - apps/emqx_durable_storage/test/emqx_ds_SUITE.erl | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/apps/emqx/test/emqx_cth_cluster.erl b/apps/emqx/test/emqx_cth_cluster.erl index cbbad0aa2..1a83056cb 100644 --- a/apps/emqx/test/emqx_cth_cluster.erl +++ b/apps/emqx/test/emqx_cth_cluster.erl @@ -245,7 +245,7 @@ default_appspec(emqx_conf, Spec, 
_NodeSpecs) -> listeners => allocate_listener_ports([tcp, ssl, ws, wss], Spec) } }; -default_appspec(emqx, Spec = #{listeners := true}, _NodeSpecs) -> +default_appspec(emqx, Spec, _NodeSpecs) -> #{config => #{listeners => allocate_listener_ports([tcp, ssl, ws, wss], Spec)}}; default_appspec(_App, _, _) -> #{}. diff --git a/apps/emqx/test/emqx_persistent_messages_SUITE.erl b/apps/emqx/test/emqx_persistent_messages_SUITE.erl index 9d814aad6..81cc3dade 100644 --- a/apps/emqx/test/emqx_persistent_messages_SUITE.erl +++ b/apps/emqx/test/emqx_persistent_messages_SUITE.erl @@ -266,7 +266,6 @@ cluster() -> Spec = #{ role => core, join_to => emqx_cth_cluster:node_name(Node1), - listeners => true, apps => app_specs() }, [ diff --git a/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl b/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl index 73eb28d85..c79856fc7 100644 --- a/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl +++ b/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl @@ -57,7 +57,6 @@ cluster(#{n := N}) -> Spec = #{ role => core, join_to => emqx_cth_cluster:node_name(Node1), - listeners => true, apps => app_specs() }, [ From e8c73b06e14c256a2f29eaf6536f37ee6877c99e Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Tue, 15 Aug 2023 16:28:42 -0300 Subject: [PATCH 23/85] docs: add comment about future test failure --- apps/emqx/test/emqx_persistent_messages_SUITE.erl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/emqx/test/emqx_persistent_messages_SUITE.erl b/apps/emqx/test/emqx_persistent_messages_SUITE.erl index 81cc3dade..e055ce794 100644 --- a/apps/emqx/test/emqx_persistent_messages_SUITE.erl +++ b/apps/emqx/test/emqx_persistent_messages_SUITE.erl @@ -183,6 +183,8 @@ t_session_subscription_iterators(Config) -> ?assertMatch({ok, [IteratorId]}, get_all_iterator_ids(Node2)), ReplayMessages1 = erpc:call(Node1, fun() -> consume(?DS_SHARD, IteratorId) end), ExpectedMessages = [Message2, Message3], + %% Note: it is expected that this will break after 
replayers are in place. + %% They might have consumed all the messages by this time. ?assertEqual(ExpectedMessages, ReplayMessages1), %% Different DS shard ReplayMessages2 = erpc:call(Node2, fun() -> consume(?DS_SHARD, IteratorId) end), From c46b8de9382454a91fdb0beeb71f0efb4dc9d77f Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Wed, 16 Aug 2023 15:50:01 -0300 Subject: [PATCH 24/85] test: remove unused things, refactor some functions --- .../test/emqx_persistent_messages_SUITE.erl | 14 ++---------- .../test/emqx_persistent_session_SUITE.erl | 22 ------------------- 2 files changed, 2 insertions(+), 34 deletions(-) diff --git a/apps/emqx/test/emqx_persistent_messages_SUITE.erl b/apps/emqx/test/emqx_persistent_messages_SUITE.erl index e055ce794..dbd4df0ae 100644 --- a/apps/emqx/test/emqx_persistent_messages_SUITE.erl +++ b/apps/emqx/test/emqx_persistent_messages_SUITE.erl @@ -23,18 +23,12 @@ -compile(export_all). -compile(nowarn_export_all). --define(NOW, - (calendar:system_time_to_rfc3339(erlang:system_time(millisecond), [{unit, millisecond}])) -). -define(DS_SHARD, <<"local">>). all() -> emqx_common_test_helpers:all(?MODULE). init_per_suite(Config) -> - %% avoid inter-suite flakiness... - application:stop(emqx), - application:stop(emqx_durable_storage), TCApps = emqx_cth_suite:start( app_specs(), #{work_dir => ?config(priv_dir, Config)} @@ -232,12 +226,8 @@ receive_messages(0, Msgs) -> receive_messages(Count, Msgs) -> receive {publish, Msg} -> - receive_messages(Count - 1, [Msg | Msgs]); - {deliver, _Topic, Msg} -> - receive_messages(Count - 1, [Msg | Msgs]); - _Other -> - receive_messages(Count, Msgs) - after 5000 -> + receive_messages(Count - 1, [Msg | Msgs]) + after 5_000 -> Msgs end. 
diff --git a/apps/emqx/test/emqx_persistent_session_SUITE.erl b/apps/emqx/test/emqx_persistent_session_SUITE.erl index d8736b918..07cfabc70 100644 --- a/apps/emqx/test/emqx_persistent_session_SUITE.erl +++ b/apps/emqx/test/emqx_persistent_session_SUITE.erl @@ -267,8 +267,6 @@ receive_messages(Count, Msgs) -> receive {publish, Msg} -> receive_messages(Count - 1, [Msg | Msgs]); - {deliver, _Topic, Msg} -> - receive_messages(Count - 1, [Msg | Msgs]); _Other -> receive_messages(Count, Msgs) after 5000 -> @@ -375,26 +373,6 @@ do_publish(Payloads = [_ | _], PublishFun, WaitForUnregister) -> do_publish(Payload, PublishFun, WaitForUnregister) -> do_publish([Payload], PublishFun, WaitForUnregister). -get_replay_messages(ReplayID) -> - DSShard = <<"local">>, - case emqx_ds_storage_layer:restore_iterator(DSShard, ReplayID) of - {ok, It} -> - do_get_replay_messages(It, []); - Error -> - error({"error restoring iterator", #{error => Error, replay_id => ReplayID}}) - end. - -do_get_replay_messages(It, Acc) -> - case emqx_ds_storage_layer:next(It) of - {value, Val, NewIt} -> - Msg = erlang:binary_to_term(Val), - do_get_replay_messages(NewIt, [Msg | Acc]); - none -> - {ok, lists:reverse(Acc)}; - {error, Reason} -> - {error, Reason} - end. 
- %%-------------------------------------------------------------------- %% Test Cases %%-------------------------------------------------------------------- From 3344bfb0bd74c5fa9542a54e278f72eb39424cdf Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Wed, 16 Aug 2023 15:56:29 -0300 Subject: [PATCH 25/85] refactor: rm `emqx_ds_replay` --- apps/emqx_durable_storage/src/emqx_ds.erl | 9 +++++ .../src/emqx_ds_message_storage_bitmask.erl | 8 ++--- .../src/emqx_ds_replay.erl | 36 ------------------- .../src/emqx_ds_storage_layer.erl | 14 ++++---- .../emqx_ds_message_storage_bitmask_shim.erl | 2 +- 5 files changed, 21 insertions(+), 48 deletions(-) delete mode 100644 apps/emqx_durable_storage/src/emqx_ds_replay.erl diff --git a/apps/emqx_durable_storage/src/emqx_ds.erl b/apps/emqx_durable_storage/src/emqx_ds.erl index 3cc7ca886..7ec9f3801 100644 --- a/apps/emqx_durable_storage/src/emqx_ds.erl +++ b/apps/emqx_durable_storage/src/emqx_ds.erl @@ -42,6 +42,8 @@ message_stats/0, message_store_opts/0, session_id/0, + replay/0, + replay_id/0, iterator_id/0, iterator/0, shard/0, @@ -80,6 +82,13 @@ %% use in emqx_guid. Otherwise, the iterators won't match the message timestamps. -type time() :: non_neg_integer(). +-type replay_id() :: binary(). + +-type replay() :: { + _TopicFilter :: emqx_topic:words(), + _StartTime :: time() +}. 
+ %%================================================================================ %% API funcions %%================================================================================ diff --git a/apps/emqx_durable_storage/src/emqx_ds_message_storage_bitmask.erl b/apps/emqx_durable_storage/src/emqx_ds_message_storage_bitmask.erl index 74a50c302..57608e5cb 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_message_storage_bitmask.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_message_storage_bitmask.erl @@ -277,13 +277,13 @@ store(DB = #db{handle = DBHandle, cf = CFHandle}, MessageID, PublishedAt, Topic, Value = make_message_value(Topic, MessagePayload), rocksdb:put(DBHandle, CFHandle, Key, Value, DB#db.write_options). --spec make_iterator(db(), emqx_ds_replay:replay()) -> +-spec make_iterator(db(), emqx_ds:replay()) -> {ok, iterator()} | {error, _TODO}. make_iterator(DB, Replay) -> Options = emqx_ds_conf:shard_iteration_options(DB#db.shard), make_iterator(DB, Replay, Options). --spec make_iterator(db(), emqx_ds_replay:replay(), iteration_options()) -> +-spec make_iterator(db(), emqx_ds:replay(), iteration_options()) -> % {error, invalid_start_time}? might just start from the beginning of time % and call it a day: client violated the contract anyway. {ok, iterator()} | {error, _TODO}. @@ -337,7 +337,7 @@ preserve_iterator(#it{cursor = Cursor}) -> }, term_to_binary(State). --spec restore_iterator(db(), emqx_ds_replay:replay(), binary()) -> +-spec restore_iterator(db(), emqx_ds:replay(), binary()) -> {ok, iterator()} | {error, _TODO}. restore_iterator(DB, Replay, Serial) when is_binary(Serial) -> State = binary_to_term(Serial), @@ -419,7 +419,7 @@ hash(Input, Bits) -> % at most 32 bits erlang:phash2(Input, 1 bsl Bits). --spec make_keyspace_filter(emqx_ds_replay:replay(), keymapper()) -> keyspace_filter(). +-spec make_keyspace_filter(emqx_ds:replay(), keymapper()) -> keyspace_filter(). 
make_keyspace_filter({TopicFilter, StartTime}, Keymapper) -> Bitstring = compute_bitstring(TopicFilter, StartTime, Keymapper), HashBitmask = compute_topic_bitmask(TopicFilter, Keymapper), diff --git a/apps/emqx_durable_storage/src/emqx_ds_replay.erl b/apps/emqx_durable_storage/src/emqx_ds_replay.erl deleted file mode 100644 index b9ffa32ac..000000000 --- a/apps/emqx_durable_storage/src/emqx_ds_replay.erl +++ /dev/null @@ -1,36 +0,0 @@ -%%-------------------------------------------------------------------- -%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. -%%-------------------------------------------------------------------- --module(emqx_ds_replay). - -%% API: --export([]). - --export_type([replay_id/0, replay/0]). - -%%================================================================================ -%% Type declarations -%%================================================================================ - --type replay_id() :: binary(). - --type replay() :: { - _TopicFilter :: emqx_ds:words(), - _StartTime :: emqx_ds:time() -}. 
- -%%================================================================================ -%% API funcions -%%================================================================================ - -%%================================================================================ -%% behaviour callbacks -%%================================================================================ - -%%================================================================================ -%% Internal exports -%%================================================================================ - -%%================================================================================ -%% Internal functions -%%================================================================================ diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl index adede5322..69c0e008c 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl @@ -71,7 +71,7 @@ -record(it, { shard :: emqx_ds:shard(), gen :: gen_id(), - replay :: emqx_ds_replay:replay(), + replay :: emqx_ds:replay(), module :: module(), data :: term() }). @@ -112,10 +112,10 @@ -callback store(_Schema, binary(), emqx_ds:time(), emqx_ds:topic(), binary()) -> ok | {error, _}. --callback make_iterator(_Schema, emqx_ds_replay:replay()) -> +-callback make_iterator(_Schema, emqx_ds:replay()) -> {ok, _It} | {error, _}. --callback restore_iterator(_Schema, emqx_ds_replay:replay(), binary()) -> {ok, _It} | {error, _}. +-callback restore_iterator(_Schema, emqx_ds:replay(), binary()) -> {ok, _It} | {error, _}. -callback preserve_iterator(_Schema, _It) -> term(). @@ -140,7 +140,7 @@ store(Shard, GUID, Time, Topic, Msg) -> {_GenId, #{module := Mod, data := Data}} = meta_lookup_gen(Shard, Time), Mod:store(Data, GUID, Time, Topic, Msg). 
--spec make_iterator(emqx_ds:shard(), emqx_ds_replay:replay()) -> +-spec make_iterator(emqx_ds:shard(), emqx_ds:replay()) -> {ok, iterator()} | {error, _TODO}. make_iterator(Shard, Replay = {_, StartTime}) -> {GenId, Gen} = meta_lookup_gen(Shard, StartTime), @@ -173,7 +173,7 @@ next(It = #it{module = Mod, data = ItData}) -> preserve_iterator(It = #it{}, IteratorID) -> iterator_put_state(IteratorID, It). --spec restore_iterator(emqx_ds:shard(), emqx_ds_replay:replay_id()) -> +-spec restore_iterator(emqx_ds:shard(), emqx_ds:replay_id()) -> {ok, iterator()} | {error, _TODO}. restore_iterator(Shard, ReplayID) -> case iterator_get_state(Shard, ReplayID) of @@ -185,7 +185,7 @@ restore_iterator(Shard, ReplayID) -> Error end. --spec is_iterator_present(emqx_ds:shard(), emqx_ds_replay:replay_id()) -> +-spec is_iterator_present(emqx_ds:shard(), emqx_ds:replay_id()) -> boolean(). is_iterator_present(Shard, ReplayID) -> %% TODO: use keyMayExist after added to wrapper? @@ -196,7 +196,7 @@ is_iterator_present(Shard, ReplayID) -> false end. --spec discard_iterator(emqx_ds:shard(), emqx_ds_replay:replay_id()) -> +-spec discard_iterator(emqx_ds:shard(), emqx_ds:replay_id()) -> ok | {error, _TODO}. discard_iterator(Shard, ReplayID) -> iterator_delete(Shard, ReplayID). diff --git a/apps/emqx_durable_storage/test/props/emqx_ds_message_storage_bitmask_shim.erl b/apps/emqx_durable_storage/test/props/emqx_ds_message_storage_bitmask_shim.erl index 10431eb1a..e9daf2581 100644 --- a/apps/emqx_durable_storage/test/props/emqx_ds_message_storage_bitmask_shim.erl +++ b/apps/emqx_durable_storage/test/props/emqx_ds_message_storage_bitmask_shim.erl @@ -31,7 +31,7 @@ store(Tab, MessageID, PublishedAt, Topic, Payload) -> true = ets:insert(Tab, {{PublishedAt, MessageID}, Topic, Payload}), ok. --spec iterate(t(), emqx_ds_replay:replay()) -> +-spec iterate(t(), emqx_ds:replay()) -> [binary()]. 
iterate(Tab, {TopicFilter, StartTime}) -> ets:foldr( From 65085d012b430d6192ecf5be4047ae713a39f65d Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Wed, 16 Aug 2023 15:57:56 -0300 Subject: [PATCH 26/85] refactor: rename fn --- apps/emqx/src/emqx_persistent_session_ds.erl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/emqx/src/emqx_persistent_session_ds.erl b/apps/emqx/src/emqx_persistent_session_ds.erl index 19e11b1a3..b862ef0d9 100644 --- a/apps/emqx/src/emqx_persistent_session_ds.erl +++ b/apps/emqx/src/emqx_persistent_session_ds.erl @@ -102,18 +102,19 @@ add_subscription(TopicFilterBin, DSSessionID) -> ?tp_span( persistent_session_ds_open_iterators, Ctx, - ok = open_iterator_on_all_nodes(TopicFilter, StartMS, IteratorID) + ok = open_iterator_on_all_shards(TopicFilter, StartMS, IteratorID) ), {ok, IteratorID, IsNew} end ). --spec open_iterator_on_all_nodes(emqx_topic:words(), emqx_ds:time(), emqx_ds:iterator_id()) -> ok. -open_iterator_on_all_nodes(TopicFilter, StartMS, IteratorID) -> +-spec open_iterator_on_all_shards(emqx_topic:words(), emqx_ds:time(), emqx_ds:iterator_id()) -> ok. +open_iterator_on_all_shards(TopicFilter, StartMS, IteratorID) -> ?tp(persistent_session_ds_will_open_iterators, #{ iterator_id => IteratorID, start_time => StartMS }), + %% Note: currently, shards map 1:1 to nodes, but this will change in the future. 
Nodes = emqx:running_nodes(), Results = emqx_persistent_session_ds_proto_v1:open_iterator( Nodes, TopicFilter, StartMS, IteratorID From dbfacae283a81bc51c306fdf20b1d41015fa94c6 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Wed, 16 Aug 2023 16:05:08 -0300 Subject: [PATCH 27/85] fix: reinstate transactions --- apps/emqx_durable_storage/src/emqx_ds.erl | 77 +++++++++++++---------- 1 file changed, 45 insertions(+), 32 deletions(-) diff --git a/apps/emqx_durable_storage/src/emqx_ds.erl b/apps/emqx_durable_storage/src/emqx_ds.erl index 7ec9f3801..889e7ea24 100644 --- a/apps/emqx_durable_storage/src/emqx_ds.erl +++ b/apps/emqx_durable_storage/src/emqx_ds.erl @@ -136,20 +136,29 @@ message_stats() -> %% the broker. -spec session_open(emqx_types:clientid()) -> {_New :: boolean(), session_id()}. session_open(ClientID) -> - case mnesia:dirty_read(?SESSION_TAB, ClientID) of - [#session{}] -> - {false, ClientID}; - [] -> - Session = #session{id = ClientID}, - mria:dirty_write(?SESSION_TAB, Session), - {true, ClientID} - end. + {atomic, Res} = + mria:transaction(?DS_SHARD, fun() -> + case mnesia:read(?SESSION_TAB, ClientID, write) of + [#session{}] -> + {false, ClientID}; + [] -> + Session = #session{id = ClientID}, + mnesia:write(?SESSION_TAB, Session, write), + {true, ClientID} + end + end), + Res. %% @doc Called when a client reconnects with `clean session=true' or %% during session GC -spec session_drop(emqx_types:clientid()) -> ok. session_drop(ClientID) -> - ok = mria:dirty_delete({?SESSION_TAB, ClientID}), + {atomic, ok} = mria:transaction( + ?DS_SHARD, + fun() -> + mnesia:delete({?SESSION_TAB, ClientID}) + end + ), ok. %% @doc Called when a client disconnects. This function terminates all @@ -164,29 +173,33 @@ session_suspend(_SessionId) -> {ok, iterator_id(), time(), _IsNew :: boolean()}. 
session_add_iterator(DSSessionId, TopicFilter) -> IteratorRefId = {DSSessionId, TopicFilter}, - case mnesia:dirty_read(?ITERATOR_REF_TAB, IteratorRefId) of - [] -> - {IteratorId, StartMS} = new_iterator_id(DSSessionId), - IteratorRef = #iterator_ref{ - ref_id = IteratorRefId, - it_id = IteratorId, - start_time = StartMS - }, - ok = mria:dirty_write(?ITERATOR_REF_TAB, IteratorRef), - ?tp( - ds_session_subscription_added, - #{iterator_id => IteratorId, session_id => DSSessionId} - ), - IsNew = true, - {ok, IteratorId, StartMS, IsNew}; - [#iterator_ref{it_id = IteratorId, start_time = StartMS}] -> - ?tp( - ds_session_subscription_present, - #{iterator_id => IteratorId, session_id => DSSessionId} - ), - IsNew = false, - {ok, IteratorId, StartMS, IsNew} - end. + {atomic, Res} = + mria:transaction(?DS_SHARD, fun() -> + case mnesia:read(?ITERATOR_REF_TAB, IteratorRefId, write) of + [] -> + {IteratorId, StartMS} = new_iterator_id(DSSessionId), + IteratorRef = #iterator_ref{ + ref_id = IteratorRefId, + it_id = IteratorId, + start_time = StartMS + }, + ok = mnesia:write(?ITERATOR_REF_TAB, IteratorRef, write), + ?tp( + ds_session_subscription_added, + #{iterator_id => IteratorId, session_id => DSSessionId} + ), + IsNew = true, + {ok, IteratorId, StartMS, IsNew}; + [#iterator_ref{it_id = IteratorId, start_time = StartMS}] -> + ?tp( + ds_session_subscription_present, + #{iterator_id => IteratorId, session_id => DSSessionId} + ), + IsNew = false, + {ok, IteratorId, StartMS, IsNew} + end + end), + Res. %% @doc Called when a client unsubscribes from a topic. 
Returns `true' %% if the session contained the subscription or `false' if it wasn't From dad27091bea87062c6e2630bdbcdd3fcdc8c67b6 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Wed, 16 Aug 2023 17:43:33 -0300 Subject: [PATCH 28/85] test: rm custom option --- apps/emqx/test/emqx_cth_cluster.erl | 14 ++--- apps/emqx/test/emqx_cth_suite.erl | 2 - .../test/emqx_ds_SUITE.erl | 59 +++++++++++++++---- 3 files changed, 50 insertions(+), 25 deletions(-) diff --git a/apps/emqx/test/emqx_cth_cluster.erl b/apps/emqx/test/emqx_cth_cluster.erl index 1a83056cb..e24600181 100644 --- a/apps/emqx/test/emqx_cth_cluster.erl +++ b/apps/emqx/test/emqx_cth_cluster.erl @@ -20,7 +20,8 @@ -export([stop/1, stop_node/1]). -export([share_load_module/2]). --export([node_name/1]). +-export([node_name/1, mk_nodespecs/2]). +-export([start_apps/2, set_node_opts/2]). -define(APPS_CLUSTERING, [gen_rpc, mria, ekka]). @@ -80,12 +81,7 @@ when %% Working directory %% Everything a test produces should go here. Each node's stuff should go in its %% own directory. - work_dir := file:name(), - %% Usually, we want to ensure the node / test suite starts from a clean slate. - %% However, sometimes, we may want to test restarting a node. For such - %% situations, we need to disable this check to allow resuming from an existing - %% state. - skip_clean_suite_state_check => boolean() + work_dir := file:name() }. start(Nodes, ClusterOpts) -> NodeSpecs = mk_nodespecs(Nodes, ClusterOpts), @@ -129,14 +125,12 @@ mk_init_nodespec(N, Name, NodeOpts, ClusterOpts) -> Node = node_name(Name), BasePort = base_port(N), WorkDir = maps:get(work_dir, ClusterOpts), - SkipCleanSuiteStateCheck = maps:get(skip_clean_suite_state_check, ClusterOpts, false), Defaults = #{ name => Node, role => core, apps => [], base_port => BasePort, work_dir => filename:join([WorkDir, Node]), - skip_clean_suite_state_check => SkipCleanSuiteStateCheck, driver => ct_slave }, maps:merge(Defaults, NodeOpts). 
@@ -307,7 +301,7 @@ start_apps(Node, #{apps := Apps} = Spec) -> ok. suite_opts(Spec) -> - maps:with([work_dir, skip_clean_suite_state_check], Spec). + maps:with([work_dir], Spec). maybe_join_cluster(_Node, #{role := replicant}) -> ok; diff --git a/apps/emqx/test/emqx_cth_suite.erl b/apps/emqx/test/emqx_cth_suite.erl index dbe9423da..9b3e58da4 100644 --- a/apps/emqx/test/emqx_cth_suite.erl +++ b/apps/emqx/test/emqx_cth_suite.erl @@ -358,8 +358,6 @@ stop_apps(Apps) -> %% -verify_clean_suite_state(#{skip_clean_suite_state_check := true}) -> - ok; verify_clean_suite_state(#{work_dir := WorkDir}) -> {ok, []} = file:list_dir(WorkDir), none = persistent_term:get(?EMQX_AUTHENTICATION_SCHEMA_MODULE_PT_KEY, none), diff --git a/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl b/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl index c79856fc7..842782e35 100644 --- a/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl +++ b/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl @@ -20,9 +20,6 @@ all() -> emqx_common_test_helpers:all(?MODULE). init_per_suite(Config) -> - %% avoid inter-suite flakiness... - application:stop(emqx), - application:stop(emqx_durable_storage), TCApps = emqx_cth_suite:start( app_specs(), #{work_dir => ?config(priv_dir, Config)} @@ -36,8 +33,16 @@ end_per_suite(Config) -> init_per_testcase(t_session_subscription_idempotency, Config) -> Cluster = cluster(#{n => 1}), - Nodes = emqx_cth_cluster:start(Cluster, #{work_dir => ?config(priv_dir, Config)}), - [{cluster, Cluster}, {nodes, Nodes} | Config]; + ClusterOpts = #{work_dir => ?config(priv_dir, Config)}, + NodeSpecs = emqx_cth_cluster:mk_nodespecs(Cluster, ClusterOpts), + Nodes = emqx_cth_cluster:start(Cluster, ClusterOpts), + [ + {cluster, Cluster}, + {node_specs, NodeSpecs}, + {cluster_opts, ClusterOpts}, + {nodes, Nodes} + | Config + ]; init_per_testcase(_TestCase, Config) -> Config. @@ -92,12 +97,28 @@ get_all_iterator_ids(Node) -> emqx_ds_storage_layer:foldl_iterator_prefix(?DS_SHARD, <<>>, Fn, []) end). 
+wait_nodeup(Node) -> + ?retry( + _Sleep0 = 500, + _Attempts0 = 50, + pong = net_adm:ping(Node) + ). + +wait_gen_rpc_down(_NodeSpec = #{apps := Apps}) -> + #{override_env := Env} = proplists:get_value(gen_rpc, Apps), + Port = proplists:get_value(tcp_server_port, Env), + ?retry( + _Sleep0 = 500, + _Attempts0 = 50, + false = emqx_common_test_helpers:is_tcp_server_available("127.0.0.1", Port) + ). + %%------------------------------------------------------------------------------ %% Testcases %%------------------------------------------------------------------------------ t_session_subscription_idempotency(Config) -> - Cluster = ?config(cluster, Config), + [Node1Spec | _] = ?config(node_specs, Config), [Node1] = ?config(nodes, Config), Port = get_mqtt_port(Node1, tcp), SubTopicFilter = <<"t/+">>, @@ -119,13 +140,25 @@ t_session_subscription_idempotency(Config) -> spawn_link(fun() -> ?tp(will_restart_node, #{}), - ct:pal("stopping node ~p", [Node1]), - ok = emqx_cth_cluster:stop_node(Node1), - ct:pal("stopped node ~p; restarting...", [Node1]), - [Node1] = emqx_cth_cluster:start(Cluster, #{ - work_dir => ?config(priv_dir, Config), - skip_clean_suite_state_check => true - }), + ct:pal("restarting node ~p", [Node1]), + true = monitor_node(Node1, true), + ok = erpc:call(Node1, init, restart, []), + receive + {nodedown, Node1} -> + ok + after 10_000 -> + ct:fail("node ~p didn't stop", [Node1]) + end, + ct:pal("waiting for nodeup ~p", [Node1]), + wait_nodeup(Node1), + wait_gen_rpc_down(Node1Spec), + ct:pal("restarting apps on ~p", [Node1]), + Apps = maps:get(apps, Node1Spec), + ok = erpc:call(Node1, emqx_cth_suite, load_apps, [Apps]), + _ = erpc:call(Node1, emqx_cth_suite, start_apps, [Apps, Node1Spec]), + %% have to re-inject this so that we may stop the node succesfully at the + %% end.... 
+ ok = emqx_cth_cluster:set_node_opts(Node1, Node1Spec), ct:pal("node ~p restarted", [Node1]), ?tp(restarted_node, #{}), ok From c1f49abad226d933fae928fdfd5720937673c6f1 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Thu, 17 Aug 2023 14:54:57 -0300 Subject: [PATCH 29/85] test: fix inter-suite flakiness --- apps/emqx/test/emqx_crl_cache_SUITE.erl | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/emqx/test/emqx_crl_cache_SUITE.erl b/apps/emqx/test/emqx_crl_cache_SUITE.erl index 6c6337038..248013ce9 100644 --- a/apps/emqx/test/emqx_crl_cache_SUITE.erl +++ b/apps/emqx/test/emqx_crl_cache_SUITE.erl @@ -41,6 +41,7 @@ init_per_suite(Config) -> Config. end_per_suite(_Config) -> + emqx_config:erase_all(), ok. init_per_testcase(TestCase, Config) when From ee2897e5dedfb671de6221bd498f33b35497885e Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Wed, 16 Aug 2023 17:54:49 -0300 Subject: [PATCH 30/85] test(refactor): move test to integration tests dir --- .../test => emqx/integration_test}/emqx_ds_SUITE.erl | 0 apps/emqx/test/emqx_cth_suite.erl | 8 +++++++- apps/emqx/test/emqx_persistent_messages_SUITE.erl | 7 ++++++- 3 files changed, 13 insertions(+), 2 deletions(-) rename apps/{emqx_durable_storage/test => emqx/integration_test}/emqx_ds_SUITE.erl (100%) diff --git a/apps/emqx_durable_storage/test/emqx_ds_SUITE.erl b/apps/emqx/integration_test/emqx_ds_SUITE.erl similarity index 100% rename from apps/emqx_durable_storage/test/emqx_ds_SUITE.erl rename to apps/emqx/integration_test/emqx_ds_SUITE.erl diff --git a/apps/emqx/test/emqx_cth_suite.erl b/apps/emqx/test/emqx_cth_suite.erl index 9b3e58da4..80b3a578c 100644 --- a/apps/emqx/test/emqx_cth_suite.erl +++ b/apps/emqx/test/emqx_cth_suite.erl @@ -101,7 +101,13 @@ when %% function will raise an error. work_dir := file:name() }. 
-start(Apps, SuiteOpts = #{work_dir := WorkDir}) -> +start(Apps, SuiteOpts0 = #{work_dir := WorkDir0}) -> + %% when running CT on the whole app, it seems like `priv_dir` is the same on all + %% suites and leads to the "clean slate" verificatin to fail. + WorkDir = binary_to_list( + filename:join([WorkDir0, emqx_guid:to_hexstr(emqx_guid:gen())]) + ), + SuiteOpts = SuiteOpts0#{work_dir := WorkDir}, % 1. Prepare appspec instructions AppSpecs = [mk_appspec(App, SuiteOpts) || App <- Apps], % 2. Load every app so that stuff scanning attributes of loaded modules works diff --git a/apps/emqx/test/emqx_persistent_messages_SUITE.erl b/apps/emqx/test/emqx_persistent_messages_SUITE.erl index dbd4df0ae..bb2921de1 100644 --- a/apps/emqx/test/emqx_persistent_messages_SUITE.erl +++ b/apps/emqx/test/emqx_persistent_messages_SUITE.erl @@ -29,9 +29,14 @@ all() -> emqx_common_test_helpers:all(?MODULE). init_per_suite(Config) -> + %% avoid inter-suite flakiness... + %% TODO: remove after other suites start to use `emx_cth_suite' + application:stop(emqx), + application:stop(emqx_durable_storage), + WorkDir = ?config(priv_dir, Config), TCApps = emqx_cth_suite:start( app_specs(), - #{work_dir => ?config(priv_dir, Config)} + #{work_dir => WorkDir} ), [{tc_apps, TCApps} | Config]. 
From f007b4442670e6274fa438257fbff15484c373a5 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Mon, 21 Aug 2023 10:58:03 -0300 Subject: [PATCH 31/85] fix(data_import): rm duplicate import call and fix test --- apps/emqx_management/src/emqx_mgmt_data_backup.erl | 1 - apps/emqx_management/test/emqx_mgmt_data_backup_SUITE.erl | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/apps/emqx_management/src/emqx_mgmt_data_backup.erl b/apps/emqx_management/src/emqx_mgmt_data_backup.erl index b83a46903..b677da2b2 100644 --- a/apps/emqx_management/src/emqx_mgmt_data_backup.erl +++ b/apps/emqx_management/src/emqx_mgmt_data_backup.erl @@ -525,7 +525,6 @@ do_import_conf(RawConf, Opts) -> Errors = lists:foldr( fun(Module, ErrorsAcc) -> - Module:import_config(RawConf), case Module:import_config(RawConf) of {ok, #{changed := Changed}} -> maybe_print_changed(Changed, Opts), diff --git a/apps/emqx_management/test/emqx_mgmt_data_backup_SUITE.erl b/apps/emqx_management/test/emqx_mgmt_data_backup_SUITE.erl index f9b9ef766..381862995 100644 --- a/apps/emqx_management/test/emqx_mgmt_data_backup_SUITE.erl +++ b/apps/emqx_management/test/emqx_mgmt_data_backup_SUITE.erl @@ -440,8 +440,8 @@ create_test_tab(Attributes) -> apps_to_start() -> [ - {emqx_conf, "dashboard.listeners.http.bind = 0"}, {emqx, #{override_env => [{boot_modules, [broker, router]}]}}, + {emqx_conf, #{config => #{dashboard => #{listeners => #{http => #{bind => <<"0">>}}}}}}, emqx_psk, emqx_management, emqx_dashboard, From 44b080f5b36722c3dcf1236a4c45041a2c186fbe Mon Sep 17 00:00:00 2001 From: Ilya Averyanov Date: Wed, 23 Aug 2023 17:01:50 +0300 Subject: [PATCH 32/85] chore(ci): pin pytest-retry version --- .ci/docker-compose-file/python/pytest.sh | 2 +- .github/workflows/run_helm_tests.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/docker-compose-file/python/pytest.sh b/.ci/docker-compose-file/python/pytest.sh index 924c30212..649357cff 100755 --- 
a/.ci/docker-compose-file/python/pytest.sh +++ b/.ci/docker-compose-file/python/pytest.sh @@ -20,7 +20,7 @@ fi apk update && apk add git curl git clone -b develop-5.0 https://github.com/emqx/paho.mqtt.testing.git /paho.mqtt.testing -pip install pytest==7.1.2 pytest-retry +pip install pytest==7.1.2 pytest-retry==1.3.0 pytest --retries 3 -v /paho.mqtt.testing/interoperability/test_client/V5/test_connect.py -k test_basic --host "$TARGET_HOST" RESULT=$? diff --git a/.github/workflows/run_helm_tests.yaml b/.github/workflows/run_helm_tests.yaml index 1106b6057..ba3ebee5a 100644 --- a/.github/workflows/run_helm_tests.yaml +++ b/.github/workflows/run_helm_tests.yaml @@ -121,7 +121,7 @@ jobs: path: paho.mqtt.testing - name: install pytest run: | - pip install pytest==7.1.2 pytest-retry + pip install pytest==7.1.2 pytest-retry==1.3.0 echo "$HOME/.local/bin" >> $GITHUB_PATH - name: run paho test timeout-minutes: 10 From bac0f7107cb2312d04170a32c412038c260da9ea Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Tue, 22 Aug 2023 18:08:53 +0300 Subject: [PATCH 33/85] chore: bump OTP to 25.3.2-2 and EMQX builder to 5.1-4 --- .../docker-compose-kafka.yaml | 2 +- .ci/docker-compose-file/docker-compose.yaml | 2 +- .github/actions/package-macos/action.yaml | 2 +- .github/workflows/_pr_entrypoint.yaml | 16 ++++++++-------- .github/workflows/_push-entrypoint.yaml | 16 ++++++++-------- .../workflows/build_and_push_docker_images.yaml | 4 ++-- .github/workflows/build_packages.yaml | 4 ++-- .github/workflows/build_packages_cron.yaml | 6 +++--- .github/workflows/build_slim_packages.yaml | 10 +++++----- .github/workflows/codeql.yaml | 2 +- .github/workflows/performance_test.yaml | 2 +- .tool-versions | 2 +- Makefile | 2 +- build | 4 ++-- changes/ce/fix-11499.en.md | 3 +++ deploy/docker/Dockerfile | 2 +- scripts/buildx.sh | 4 ++-- scripts/pr-sanity-checks.sh | 4 ++-- scripts/relup-test/start-relup-test-cluster.sh | 2 +- 19 files changed, 46 insertions(+), 43 deletions(-) create mode 100644 
changes/ce/fix-11499.en.md diff --git a/.ci/docker-compose-file/docker-compose-kafka.yaml b/.ci/docker-compose-file/docker-compose-kafka.yaml index 18ef3991c..f5bdb24ec 100644 --- a/.ci/docker-compose-file/docker-compose-kafka.yaml +++ b/.ci/docker-compose-file/docker-compose-kafka.yaml @@ -18,7 +18,7 @@ services: - /tmp/emqx-ci/emqx-shared-secret:/var/lib/secret kdc: hostname: kdc.emqx.net - image: ghcr.io/emqx/emqx-builder/5.1-3:1.14.5-25.3.2-1-ubuntu20.04 + image: ghcr.io/emqx/emqx-builder/5.1-4:1.14.5-25.3.2-2-ubuntu20.04 container_name: kdc.emqx.net expose: - 88 # kdc diff --git a/.ci/docker-compose-file/docker-compose.yaml b/.ci/docker-compose-file/docker-compose.yaml index 504358419..9adbef02e 100644 --- a/.ci/docker-compose-file/docker-compose.yaml +++ b/.ci/docker-compose-file/docker-compose.yaml @@ -3,7 +3,7 @@ version: '3.9' services: erlang: container_name: erlang - image: ${DOCKER_CT_RUNNER_IMAGE:-ghcr.io/emqx/emqx-builder/5.1-3:1.14.5-25.3.2-1-ubuntu20.04} + image: ${DOCKER_CT_RUNNER_IMAGE:-ghcr.io/emqx/emqx-builder/5.1-4:1.14.5-25.3.2-2-ubuntu20.04} env_file: - conf.env environment: diff --git a/.github/actions/package-macos/action.yaml b/.github/actions/package-macos/action.yaml index 6b47ceafa..25edcb5f5 100644 --- a/.github/actions/package-macos/action.yaml +++ b/.github/actions/package-macos/action.yaml @@ -3,7 +3,7 @@ inputs: profile: # emqx, emqx-enterprise required: true type: string - otp: # 25.3.2-1 + otp: # 25.3.2-2 required: true type: string os: diff --git a/.github/workflows/_pr_entrypoint.yaml b/.github/workflows/_pr_entrypoint.yaml index 87c4d6145..7de9a64fd 100644 --- a/.github/workflows/_pr_entrypoint.yaml +++ b/.github/workflows/_pr_entrypoint.yaml @@ -17,7 +17,7 @@ env: jobs: sanity-checks: runs-on: ${{ github.repository_owner == 'emqx' && 'aws-amd64' || 'ubuntu-22.04' }} - container: "ghcr.io/emqx/emqx-builder/5.1-3:1.14.5-25.3.2-1-ubuntu22.04" + container: "ghcr.io/emqx/emqx-builder/5.1-4:1.14.5-25.3.2-2-ubuntu22.04" outputs: 
ct-matrix: ${{ steps.matrix.outputs.ct-matrix }} ct-host: ${{ steps.matrix.outputs.ct-host }} @@ -25,9 +25,9 @@ jobs: version-emqx: ${{ steps.matrix.outputs.version-emqx }} version-emqx-enterprise: ${{ steps.matrix.outputs.version-emqx-enterprise }} runner: ${{ github.repository_owner == 'emqx' && 'aws-amd64' || 'ubuntu-22.04' }} - builder: "ghcr.io/emqx/emqx-builder/5.1-3:1.14.5-25.3.2-1-ubuntu22.04" - builder_vsn: "5.1-3" - otp_vsn: "25.3.2-1" + builder: "ghcr.io/emqx/emqx-builder/5.1-4:1.14.5-25.3.2-2-ubuntu22.04" + builder_vsn: "5.1-4" + otp_vsn: "25.3.2-2" elixir_vsn: "1.14.5" steps: @@ -93,13 +93,13 @@ jobs: MATRIX="$(echo "${APPS}" | jq -c ' [ (.[] | select(.profile == "emqx") | . + { - builder: "5.1-3", - otp: "25.3.2-1", + builder: "5.1-4", + otp: "25.3.2-2", elixir: "1.14.5" }), (.[] | select(.profile == "emqx-enterprise") | . + { - builder: "5.1-3", - otp: ["25.3.2-1"][], + builder: "5.1-4", + otp: ["25.3.2-2"][], elixir: "1.14.5" }) ] diff --git a/.github/workflows/_push-entrypoint.yaml b/.github/workflows/_push-entrypoint.yaml index bc3bc486e..afdf2a050 100644 --- a/.github/workflows/_push-entrypoint.yaml +++ b/.github/workflows/_push-entrypoint.yaml @@ -21,7 +21,7 @@ env: jobs: prepare: runs-on: ${{ github.repository_owner == 'emqx' && 'aws-amd64' || 'ubuntu-22.04' }} - container: 'ghcr.io/emqx/emqx-builder/5.1-3:1.14.5-25.3.2-1-ubuntu22.04' + container: 'ghcr.io/emqx/emqx-builder/5.1-4:1.14.5-25.3.2-2-ubuntu22.04' outputs: profile: ${{ steps.parse-git-ref.outputs.profile }} release: ${{ steps.parse-git-ref.outputs.release }} @@ -31,9 +31,9 @@ jobs: ct-host: ${{ steps.matrix.outputs.ct-host }} ct-docker: ${{ steps.matrix.outputs.ct-docker }} runner: ${{ github.repository_owner == 'emqx' && 'aws-amd64' || 'ubuntu-22.04' }} - builder: 'ghcr.io/emqx/emqx-builder/5.1-3:1.14.5-25.3.2-1-ubuntu22.04' - builder_vsn: '5.1-3' - otp_vsn: '25.3.2-1' + builder: 'ghcr.io/emqx/emqx-builder/5.1-4:1.14.5-25.3.2-2-ubuntu22.04' + builder_vsn: '5.1-4' + otp_vsn: 
'25.3.2-2' elixir_vsn: '1.14.5' steps: @@ -64,13 +64,13 @@ jobs: MATRIX="$(echo "${APPS}" | jq -c ' [ (.[] | select(.profile == "emqx") | . + { - builder: "5.1-3", - otp: "25.3.2-1", + builder: "5.1-4", + otp: "25.3.2-2", elixir: "1.14.5" }), (.[] | select(.profile == "emqx-enterprise") | . + { - builder: "5.1-3", - otp: ["25.3.2-1"][], + builder: "5.1-4", + otp: ["25.3.2-2"][], elixir: "1.14.5" }) ] diff --git a/.github/workflows/build_and_push_docker_images.yaml b/.github/workflows/build_and_push_docker_images.yaml index b2bfe735b..3f568e430 100644 --- a/.github/workflows/build_and_push_docker_images.yaml +++ b/.github/workflows/build_and_push_docker_images.yaml @@ -61,7 +61,7 @@ on: otp_vsn: required: false type: string - default: '25.3.2-1' + default: '25.3.2-2' elixir_vsn: required: false type: string @@ -69,7 +69,7 @@ on: builder_vsn: required: false type: string - default: '5.1-3' + default: '5.1-4' runner: required: false type: string diff --git a/.github/workflows/build_packages.yaml b/.github/workflows/build_packages.yaml index d33d46f11..d482d2c0e 100644 --- a/.github/workflows/build_packages.yaml +++ b/.github/workflows/build_packages.yaml @@ -57,7 +57,7 @@ on: otp_vsn: required: false type: string - default: '25.3.2-1' + default: '25.3.2-2' elixir_vsn: required: false type: string @@ -69,7 +69,7 @@ on: builder_vsn: required: false type: string - default: '5.1-3' + default: '5.1-4' jobs: windows: diff --git a/.github/workflows/build_packages_cron.yaml b/.github/workflows/build_packages_cron.yaml index 431c4f5c4..a67ab81d2 100644 --- a/.github/workflows/build_packages_cron.yaml +++ b/.github/workflows/build_packages_cron.yaml @@ -24,7 +24,7 @@ jobs: - ['emqx-enterprise', 'release-51'] - ['emqx-enterprise', 'release-52'] otp: - - 25.3.2-1 + - 25.3.2-2 arch: - amd64 os: @@ -32,7 +32,7 @@ jobs: - ubuntu22.04 - amzn2023 builder: - - 5.1-3 + - 5.1-4 elixir: - 1.14.5 @@ -99,7 +99,7 @@ jobs: branch: - master otp: - - 25.3.2-1 + - 25.3.2-2 os: - macos-13 - 
macos-12-arm64 diff --git a/.github/workflows/build_slim_packages.yaml b/.github/workflows/build_slim_packages.yaml index b7ba78ef4..8e13ec0a1 100644 --- a/.github/workflows/build_slim_packages.yaml +++ b/.github/workflows/build_slim_packages.yaml @@ -34,15 +34,15 @@ on: builder: required: false type: string - default: 'ghcr.io/emqx/emqx-builder/5.1-3:1.14.5-25.3.2-1-ubuntu22.04' + default: 'ghcr.io/emqx/emqx-builder/5.1-4:1.14.5-25.3.2-2-ubuntu22.04' builder_vsn: required: false type: string - default: '5.1-3' + default: '5.1-4' otp_vsn: required: false type: string - default: '25.3.2-1' + default: '25.3.2-2' elixir_vsn: required: false type: string @@ -58,8 +58,8 @@ jobs: fail-fast: false matrix: profile: - - ["emqx", "25.3.2-1", "ubuntu20.04", "elixir"] - - ["emqx-enterprise", "25.3.2-1", "ubuntu20.04", "erlang"] + - ["emqx", "25.3.2-2", "ubuntu20.04", "elixir"] + - ["emqx-enterprise", "25.3.2-2", "ubuntu20.04", "erlang"] container: "ghcr.io/emqx/emqx-builder/${{ inputs.builder_vsn }}:${{ inputs.elixir_vsn }}-${{ matrix.profile[1] }}-${{ matrix.profile[2] }}" diff --git a/.github/workflows/codeql.yaml b/.github/workflows/codeql.yaml index 6d4cc3dc4..a0b701d17 100644 --- a/.github/workflows/codeql.yaml +++ b/.github/workflows/codeql.yaml @@ -18,7 +18,7 @@ jobs: contents: read security-events: write container: - image: ghcr.io/emqx/emqx-builder/5.1-1:1.14.5-25.3.2-1-ubuntu22.04 + image: ghcr.io/emqx/emqx-builder/5.1-4:1.14.5-25.3.2-2-ubuntu22.04 strategy: fail-fast: false diff --git a/.github/workflows/performance_test.yaml b/.github/workflows/performance_test.yaml index 10b040271..224cfb0b3 100644 --- a/.github/workflows/performance_test.yaml +++ b/.github/workflows/performance_test.yaml @@ -23,7 +23,7 @@ jobs: prepare: runs-on: ubuntu-latest if: github.repository_owner == 'emqx' - container: ghcr.io/emqx/emqx-builder/5.1-3:1.14.5-25.3.2-1-ubuntu20.04 + container: ghcr.io/emqx/emqx-builder/5.1-4:1.14.5-25.3.2-2-ubuntu20.04 outputs: BENCH_ID: ${{ 
steps.prepare.outputs.BENCH_ID }} PACKAGE_FILE: ${{ steps.package_file.outputs.PACKAGE_FILE }} diff --git a/.tool-versions b/.tool-versions index 3a2251dc8..a988325fa 100644 --- a/.tool-versions +++ b/.tool-versions @@ -1,2 +1,2 @@ -erlang 25.3.2-1 +erlang 25.3.2-2 elixir 1.14.5-otp-25 diff --git a/Makefile b/Makefile index 2f3d7067c..fc9286837 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ REBAR = $(CURDIR)/rebar3 BUILD = $(CURDIR)/build SCRIPTS = $(CURDIR)/scripts export EMQX_RELUP ?= true -export EMQX_DEFAULT_BUILDER = ghcr.io/emqx/emqx-builder/5.1-3:1.14.5-25.3.2-1-debian11 +export EMQX_DEFAULT_BUILDER = ghcr.io/emqx/emqx-builder/5.1-4:1.14.5-25.3.2-2-debian11 export EMQX_DEFAULT_RUNNER = debian:11-slim export EMQX_REL_FORM ?= tgz export QUICER_DOWNLOAD_FROM_RELEASE = 1 diff --git a/build b/build index 03d1ce673..874e4088c 100755 --- a/build +++ b/build @@ -369,9 +369,9 @@ docker_cleanup() { ## Build the default docker image based on debian 11. make_docker() { - local EMQX_BUILDER_VERSION="${EMQX_BUILDER_VERSION:-5.1-3}" + local EMQX_BUILDER_VERSION="${EMQX_BUILDER_VERSION:-5.1-4}" local EMQX_BUILDER_PLATFORM="${EMQX_BUILDER_PLATFORM:-debian11}" - local EMQX_BUILDER_OTP="${EMQX_BUILDER_OTP:-25.3.2-1}" + local EMQX_BUILDER_OTP="${EMQX_BUILDER_OTP:-25.3.2-2}" local EMQX_BUILDER_ELIXIR="${EMQX_BUILDER_ELIXIR:-1.14.5}" local EMQX_BUILDER=${EMQX_BUILDER:-ghcr.io/emqx/emqx-builder/${EMQX_BUILDER_VERSION}:${EMQX_BUILDER_ELIXIR}-${EMQX_BUILDER_OTP}-${EMQX_BUILDER_PLATFORM}} local EMQX_RUNNER="${EMQX_RUNNER:-${EMQX_DEFAULT_RUNNER}}" diff --git a/changes/ce/fix-11499.en.md b/changes/ce/fix-11499.en.md new file mode 100644 index 000000000..3ed4d1e15 --- /dev/null +++ b/changes/ce/fix-11499.en.md @@ -0,0 +1,3 @@ +Upgrade Erlang/OTP to 25.3.2-2 + +Erlang/OTP 25.3.2-2 excludes sensitive data from mnesia_hook log message. 
diff --git a/deploy/docker/Dockerfile b/deploy/docker/Dockerfile index 61a143cae..76ded75eb 100644 --- a/deploy/docker/Dockerfile +++ b/deploy/docker/Dockerfile @@ -1,4 +1,4 @@ -ARG BUILD_FROM=ghcr.io/emqx/emqx-builder/5.1-3:1.14.5-25.3.2-1-debian11 +ARG BUILD_FROM=ghcr.io/emqx/emqx-builder/5.1-4:1.14.5-25.3.2-2-debian11 ARG RUN_FROM=debian:11-slim FROM ${BUILD_FROM} AS builder diff --git a/scripts/buildx.sh b/scripts/buildx.sh index 462ab6612..662a7233c 100755 --- a/scripts/buildx.sh +++ b/scripts/buildx.sh @@ -9,7 +9,7 @@ ## example: ## ./scripts/buildx.sh --profile emqx --pkgtype tgz --arch arm64 \ -## --builder ghcr.io/emqx/emqx-builder/5.1-3:1.14.5-25.3.2-1-debian11 +## --builder ghcr.io/emqx/emqx-builder/5.1-4:1.14.5-25.3.2-2-debian11 set -euo pipefail @@ -24,7 +24,7 @@ help() { echo "--arch amd64|arm64: Target arch to build the EMQX package for" echo "--src_dir : EMQX source code in this dir, default to PWD" echo "--builder : Builder image to pull" - echo " E.g. ghcr.io/emqx/emqx-builder/5.1-3:1.14.5-25.3.2-1-debian11" + echo " E.g. ghcr.io/emqx/emqx-builder/5.1-4:1.14.5-25.3.2-2-debian11" } die() { diff --git a/scripts/pr-sanity-checks.sh b/scripts/pr-sanity-checks.sh index 6b193b74e..19321230b 100755 --- a/scripts/pr-sanity-checks.sh +++ b/scripts/pr-sanity-checks.sh @@ -12,8 +12,8 @@ if ! 
type "yq" > /dev/null; then exit 1 fi -EMQX_BUILDER_VERSION=${EMQX_BUILDER_VERSION:-5.1-3} -EMQX_BUILDER_OTP=${EMQX_BUILDER_OTP:-25.3.2-1} +EMQX_BUILDER_VERSION=${EMQX_BUILDER_VERSION:-5.1-4} +EMQX_BUILDER_OTP=${EMQX_BUILDER_OTP:-25.3.2-2} EMQX_BUILDER_ELIXIR=${EMQX_BUILDER_ELIXIR:-1.14.5} EMQX_BUILDER_PLATFORM=${EMQX_BUILDER_PLATFORM:-ubuntu22.04} EMQX_BUILDER=${EMQX_BUILDER:-ghcr.io/emqx/emqx-builder/${EMQX_BUILDER_VERSION}:${EMQX_BUILDER_ELIXIR}-${EMQX_BUILDER_OTP}-${EMQX_BUILDER_PLATFORM}} diff --git a/scripts/relup-test/start-relup-test-cluster.sh b/scripts/relup-test/start-relup-test-cluster.sh index 9cc0eaffe..2cee1394e 100755 --- a/scripts/relup-test/start-relup-test-cluster.sh +++ b/scripts/relup-test/start-relup-test-cluster.sh @@ -22,7 +22,7 @@ WEBHOOK="webhook.$NET" BENCH="bench.$NET" COOKIE='this-is-a-secret' ## Erlang image is needed to run webhook server and emqtt-bench -ERLANG_IMAGE="ghcr.io/emqx/emqx-builder/5.1-3:1.14.5-25.3.2-1-ubuntu20.04" +ERLANG_IMAGE="ghcr.io/emqx/emqx-builder/5.1-4:1.14.5-25.3.2-2-ubuntu20.04" # builder has emqtt-bench installed BENCH_IMAGE="$ERLANG_IMAGE" From 7adbf319b10784385975d0c3d00c3195a4b9a126 Mon Sep 17 00:00:00 2001 From: Ivan Dyachkov Date: Thu, 24 Aug 2023 08:13:49 +0200 Subject: [PATCH 34/85] ci: use more sensible name for checks --- .github/workflows/run_test_cases.yaml | 14 ++++++++++++++ .github/workflows/static_checks.yaml | 1 + 2 files changed, 15 insertions(+) diff --git a/.github/workflows/run_test_cases.yaml b/.github/workflows/run_test_cases.yaml index 48e551612..82b2bbeb9 100644 --- a/.github/workflows/run_test_cases.yaml +++ b/.github/workflows/run_test_cases.yaml @@ -29,6 +29,7 @@ env: jobs: eunit_and_proper: runs-on: ${{ inputs.runner }} + name: "eunit_and_proper (${{ matrix.profile }})" strategy: fail-fast: false matrix: @@ -69,6 +70,7 @@ jobs: ct_docker: runs-on: ${{ inputs.runner }} + name: "ct_docker (${{ matrix.app }}-${{ matrix.suitegroup }})" strategy: fail-fast: false matrix: @@ -116,6 
+118,7 @@ jobs: ct: runs-on: ${{ inputs.runner }} + name: "ct (${{ matrix.app }}-${{ matrix.suitegroup }})" strategy: fail-fast: false matrix: @@ -155,6 +158,17 @@ jobs: name: logs-${{ matrix.profile }}-${{ matrix.prefix }}-${{ matrix.otp }}-sg${{ matrix.suitegroup }} path: _build/test/logs + tests_passed: + needs: + - eunit_and_proper + - ct + - ct_docker + runs-on: ${{ inputs.runner }} + strategy: + fail-fast: false + steps: + - run: echo "All tests passed" + make_cover: needs: - eunit_and_proper diff --git a/.github/workflows/static_checks.yaml b/.github/workflows/static_checks.yaml index 3b32a36b4..21a753a37 100644 --- a/.github/workflows/static_checks.yaml +++ b/.github/workflows/static_checks.yaml @@ -23,6 +23,7 @@ env: jobs: static_checks: runs-on: ${{ inputs.runner }} + name: "static_checks (${{ matrix.profile }})" strategy: fail-fast: false matrix: From 9fe5080705fe3ab4ce640c7cf6bbec7215730578 Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Thu, 24 Aug 2023 16:46:27 +0200 Subject: [PATCH 35/85] build: bump versions before release --- apps/emqx/include/emqx_release.hrl | 2 +- deploy/charts/emqx/Chart.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/emqx/include/emqx_release.hrl b/apps/emqx/include/emqx_release.hrl index c7fa97be7..d66c7982a 100644 --- a/apps/emqx/include/emqx_release.hrl +++ b/apps/emqx/include/emqx_release.hrl @@ -32,7 +32,7 @@ %% `apps/emqx/src/bpapi/README.md' %% Opensource edition --define(EMQX_RELEASE_CE, "5.1.5-build.3"). +-define(EMQX_RELEASE_CE, "5.1.6"). %% Enterprise edition -define(EMQX_RELEASE_EE, "5.2.0-alpha.3"). diff --git a/deploy/charts/emqx/Chart.yaml b/deploy/charts/emqx/Chart.yaml index 1451347e2..f8cd69735 100644 --- a/deploy/charts/emqx/Chart.yaml +++ b/deploy/charts/emqx/Chart.yaml @@ -14,8 +14,8 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. 
-version: 5.1.5-build.3 +version: 5.1.6 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. -appVersion: 5.1.5-build.3 +appVersion: 5.1.6 From fa6a98e2a3312bc40c117a2d35db71635bbce185 Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Thu, 24 Aug 2023 16:50:35 +0200 Subject: [PATCH 36/85] chore: fix format of file to pass format check --- scripts/check_missing_reboot_apps.exs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/check_missing_reboot_apps.exs b/scripts/check_missing_reboot_apps.exs index d9933e099..91d4b39ea 100755 --- a/scripts/check_missing_reboot_apps.exs +++ b/scripts/check_missing_reboot_apps.exs @@ -24,10 +24,10 @@ apps = :xref.start(:xref) :xref.set_default(:xref, warnings: false) -rel_dir = '_build/#{profile}/lib/' +rel_dir = ~c"_build/#{profile}/lib/" :xref.add_release(:xref, rel_dir) -{:ok, calls} = :xref.q(:xref, '(App) (XC | [#{Enum.join(apps, ",")}] || mria:create_table/_)') +{:ok, calls} = :xref.q(:xref, ~c"(App) (XC | [#{Enum.join(apps, ",")}] || mria:create_table/_)") emqx_calls = calls From 237d1ae255bd675649bdbbd906d43a6adb8b3a37 Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Thu, 24 Aug 2023 16:51:51 +0200 Subject: [PATCH 37/85] docs: Generate changelog for v5.1.6 --- changes/v5.1.6.en.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 changes/v5.1.6.en.md diff --git a/changes/v5.1.6.en.md b/changes/v5.1.6.en.md new file mode 100644 index 000000000..3c393c55b --- /dev/null +++ b/changes/v5.1.6.en.md @@ -0,0 +1,30 @@ +# v5.1.6 + +## Enhancements + +- [#11429](https://github.com/emqx/emqx/pull/11429) Added option to configure detection of legacy protocol in MongoDB connectors and bridges. + +- [#11436](https://github.com/emqx/emqx/pull/11436) Add a new API endpoint `DELETE /banned` to clear all `banned` data.
+ +- [#11438](https://github.com/emqx/emqx/pull/11438) Changed the type of the `mqtt.max_packet_size` from string to byteSize to better represent the valid numeric range. + Strings will still be accepted for backwards compatibility. + +- [#11446](https://github.com/emqx/emqx/pull/11446) Refactored datetime-related modules and functions to simplify the code. + +- [#11396](https://github.com/emqx/emqx/pull/11396) Introduce topic index for the rule engine runtime that significantly improves the performance of EMQX with a non-trivial number of rules consuming messages matching different topic filters. + +## Bug Fixes + +- [#11424](https://github.com/emqx/emqx/pull/11424) Add a check for the maximum value of the timestamp in the API to ensure it is a valid Unix timestamp. + +- [#11445](https://github.com/emqx/emqx/pull/11445) Removed os_mon application monitor support on Windows platforms to prevent VM crashes. + Functionality remains on non-Windows platforms. + +- [#11454](https://github.com/emqx/emqx/pull/11454) Fixed crashing when debugging/tracing with large payloads (introduced in [#11279](https://github.com/emqx/emqx/pull/11279)). + +- [#11456](https://github.com/emqx/emqx/pull/11456) Removed validation that enforced non-empty PEM for CA cert file. + CA certificate file PEM can now be empty. + +- [#11499](https://github.com/emqx/emqx/pull/11499) Upgrade Erlang/OTP to 25.3.2-2 + + Erlang/OTP 25.3.2-2 excludes sensitive data from mnesia_hook log message.
From c65db82b07bc13dbd0d5c34ead103d9eb223dd5b Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Fri, 25 Aug 2023 11:01:23 +0200 Subject: [PATCH 38/85] fix: bad error message when rule engine schema name is too long Fixes: https://emqx.atlassian.net/browse/EMQX-10778 --- .../src/emqx_schema_registry.app.src | 2 +- .../src/emqx_schema_registry.erl | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/apps/emqx_schema_registry/src/emqx_schema_registry.app.src b/apps/emqx_schema_registry/src/emqx_schema_registry.app.src index b79c9ec01..9145f5dc0 100644 --- a/apps/emqx_schema_registry/src/emqx_schema_registry.app.src +++ b/apps/emqx_schema_registry/src/emqx_schema_registry.app.src @@ -1,6 +1,6 @@ {application, emqx_schema_registry, [ {description, "EMQX Schema Registry"}, - {vsn, "0.1.5"}, + {vsn, "0.1.6"}, {registered, [emqx_schema_registry_sup]}, {mod, {emqx_schema_registry_app, []}}, {included_applications, [ diff --git a/apps/emqx_schema_registry/src/emqx_schema_registry.erl b/apps/emqx_schema_registry/src/emqx_schema_registry.erl index 3f09ac347..6face86d6 100644 --- a/apps/emqx_schema_registry/src/emqx_schema_registry.erl +++ b/apps/emqx_schema_registry/src/emqx_schema_registry.erl @@ -64,7 +64,7 @@ get_serde(SchemaName) -> get_schema(SchemaName) -> case emqx_config:get( - [?CONF_KEY_ROOT, schemas, binary_to_atom(SchemaName)], undefined + [?CONF_KEY_ROOT, schemas, schema_name_bin_to_atom(SchemaName)], undefined ) of undefined -> @@ -333,6 +333,20 @@ async_delete_serdes(Names) -> to_bin(A) when is_atom(A) -> atom_to_binary(A); to_bin(B) when is_binary(B) -> B. +schema_name_bin_to_atom(Bin) when size(Bin) > 255 -> + erlang:throw( + iolist_to_binary( + io_lib:format( + "Name is is too long." + " Please provide a shorter name (<= 255 bytes)." + " The name that is too long: \"~s\"", + [Bin] + ) + ) + ); +schema_name_bin_to_atom(Bin) -> + binary_to_atom(Bin, utf8). + -spec serde_to_map(serde()) -> serde_map(). 
serde_to_map(#serde{} = Serde) -> #{ From 00b2712f29517cd4f1a4a4e8908f21861520565a Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Fri, 25 Aug 2023 11:09:44 +0200 Subject: [PATCH 39/85] docs: changelog entry for improved too long schema name fix --- changes/ce/fix-11522.en.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/ce/fix-11522.en.md diff --git a/changes/ce/fix-11522.en.md b/changes/ce/fix-11522.en.md new file mode 100644 index 000000000..fdb56b4e2 --- /dev/null +++ b/changes/ce/fix-11522.en.md @@ -0,0 +1 @@ +Improved error message for rule engine schema registry when schema name exceeds permissible length. From f15f59650d833f82e5f037688ef01c0ceb2e6edf Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Fri, 25 Aug 2023 13:49:33 -0300 Subject: [PATCH 40/85] test: rm obsolete workaround code --- apps/emqx/test/emqx_persistent_messages_SUITE.erl | 3 --- 1 file changed, 3 deletions(-) diff --git a/apps/emqx/test/emqx_persistent_messages_SUITE.erl b/apps/emqx/test/emqx_persistent_messages_SUITE.erl index bb2921de1..db22b19e6 100644 --- a/apps/emqx/test/emqx_persistent_messages_SUITE.erl +++ b/apps/emqx/test/emqx_persistent_messages_SUITE.erl @@ -250,9 +250,6 @@ app_specs() -> [ emqx_durable_storage, {emqx, #{ - before_start => fun() -> - emqx_app:set_config_loader(?MODULE) - end, config => #{persistent_session_store => #{ds => true}}, override_env => [{boot_modules, [broker, listeners]}] }} From 33a0048155b66bc78bbdfac7502553caa701b279 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Fri, 25 Aug 2023 15:24:53 -0300 Subject: [PATCH 41/85] refactor: move logic to `ensure_iterator` --- apps/emqx/src/emqx_persistent_session_ds.erl | 12 +++------- .../src/emqx_ds_storage_layer.erl | 23 +++++++++++-------- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/apps/emqx/src/emqx_persistent_session_ds.erl b/apps/emqx/src/emqx_persistent_session_ds.erl index b862ef0d9..dc615fd5b 100644 ---
a/apps/emqx/src/emqx_persistent_session_ds.erl +++ b/apps/emqx/src/emqx_persistent_session_ds.erl @@ -123,18 +123,12 @@ open_iterator_on_all_shards(TopicFilter, StartMS, IteratorID) -> true = lists:all(fun(Res) -> Res =:= {ok, ok} end, Results), ok. +%% RPC target. -spec do_open_iterator(emqx_topic:words(), emqx_ds:time(), emqx_ds:iterator_id()) -> ok. do_open_iterator(TopicFilter, StartMS, IteratorID) -> Replay = {TopicFilter, StartMS}, - case emqx_ds_storage_layer:is_iterator_present(?DS_SHARD, IteratorID) of - true -> - {ok, _It} = emqx_ds_storage_layer:restore_iterator(?DS_SHARD, IteratorID), - ok; - false -> - {ok, It} = emqx_ds_storage_layer:make_iterator(?DS_SHARD, Replay), - ok = emqx_ds_storage_layer:preserve_iterator(It, IteratorID), - ok - end. + {ok, _It} = emqx_ds_storage_layer:ensure_iterator(?DS_SHARD, IteratorID, Replay), + ok. %% diff --git a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl index 69c0e008c..47c29e170 100644 --- a/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl +++ b/apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl @@ -17,7 +17,7 @@ preserve_iterator/2, restore_iterator/2, discard_iterator/2, - is_iterator_present/2, + ensure_iterator/3, discard_iterator_prefix/2, list_iterator_prefix/2, foldl_iterator_prefix/4 @@ -185,15 +185,18 @@ restore_iterator(Shard, ReplayID) -> Error end. --spec is_iterator_present(emqx_ds:shard(), emqx_ds:replay_id()) -> - boolean(). -is_iterator_present(Shard, ReplayID) -> - %% TODO: use keyMayExist after added to wrapper? - case iterator_get_state(Shard, ReplayID) of - {ok, _} -> - true; - _ -> - false +-spec ensure_iterator(emqx_ds:shard(), emqx_ds:iterator_id(), emqx_ds:replay()) -> + {ok, iterator()} | {error, _TODO}. 
+ensure_iterator(Shard, IteratorID, Replay = {_TopicFilter, _StartMS}) -> + case restore_iterator(Shard, IteratorID) of + {ok, It} -> + {ok, It}; + {error, not_found} -> + {ok, It} = make_iterator(Shard, Replay), + ok = emqx_ds_storage_layer:preserve_iterator(It, IteratorID), + {ok, It}; + Error -> + Error end. -spec discard_iterator(emqx_ds:shard(), emqx_ds:replay_id()) -> From b0d4a22aa850db1a6f81637db7f011a0f86d3225 Mon Sep 17 00:00:00 2001 From: Ilya Averyanov Date: Fri, 18 Aug 2023 21:10:20 +0300 Subject: [PATCH 42/85] chore(ft): refactor async reply mechanism --- apps/emqx/src/emqx_channel.erl | 15 +- apps/emqx/src/emqx_cm.erl | 2 +- apps/emqx_ft/src/emqx_ft.erl | 181 +++++++++--------- apps/emqx_ft/src/emqx_ft_app.erl | 1 + apps/emqx_ft/src/emqx_ft_async_reply.erl | 106 ++++++++++ apps/emqx_ft/src/emqx_ft_responder.erl | 116 ----------- apps/emqx_ft/src/emqx_ft_responder_sup.erl | 48 ----- apps/emqx_ft/src/emqx_ft_storage.erl | 8 + apps/emqx_ft/src/emqx_ft_sup.erl | 11 +- apps/emqx_ft/test/emqx_ft_SUITE.erl | 2 +- apps/emqx_ft/test/emqx_ft_responder_SUITE.erl | 84 -------- 11 files changed, 221 insertions(+), 353 deletions(-) create mode 100644 apps/emqx_ft/src/emqx_ft_async_reply.erl delete mode 100644 apps/emqx_ft/src/emqx_ft_responder.erl delete mode 100644 apps/emqx_ft/src/emqx_ft_responder_sup.erl delete mode 100644 apps/emqx_ft/test/emqx_ft_responder_SUITE.erl diff --git a/apps/emqx/src/emqx_channel.erl b/apps/emqx/src/emqx_channel.erl index 87680358f..8d0a58767 100644 --- a/apps/emqx/src/emqx_channel.erl +++ b/apps/emqx/src/emqx_channel.erl @@ -1252,6 +1252,11 @@ handle_info({disconnect, ReasonCode, ReasonName, Props}, Channel) -> handle_out(disconnect, {ReasonCode, ReasonName, Props}, Channel); handle_info({puback, PacketId, PubRes, RC}, Channel) -> do_finish_publish(PacketId, PubRes, RC, Channel); +handle_info({'DOWN', Ref, process, Pid, Reason}, Channel) -> + case emqx_hooks:run_fold('client.monitored_process_down', [Ref, Pid, Reason], []) of + [] 
-> {ok, Channel}; + Msgs -> {ok, Msgs, Channel} + end; handle_info(Info, Channel) -> ?SLOG(error, #{msg => "unexpected_info", info => Info}), {ok, Channel}. @@ -1358,9 +1363,13 @@ handle_timeout( {_, Quota2} -> {ok, clean_timer(quota_timer, Channel#channel{quota = Quota2})} end; -handle_timeout(_TRef, Msg, Channel) -> - ?SLOG(error, #{msg => "unexpected_timeout", timeout_msg => Msg}), - {ok, Channel}. +handle_timeout(TRef, Msg, Channel) -> + case emqx_hooks:run_fold('client.timeout', [TRef, Msg], []) of + [] -> + {ok, Channel}; + Msgs -> + {ok, Msgs, Channel} + end. %%-------------------------------------------------------------------- %% Ensure timers diff --git a/apps/emqx/src/emqx_cm.erl b/apps/emqx/src/emqx_cm.erl index c680560fb..4e4afa678 100644 --- a/apps/emqx/src/emqx_cm.erl +++ b/apps/emqx/src/emqx_cm.erl @@ -189,7 +189,7 @@ do_unregister_channel({_ClientId, ChanPid} = Chan) -> true = ets:delete(?CHAN_CONN_TAB, Chan), true = ets:delete(?CHAN_INFO_TAB, Chan), ets:delete_object(?CHAN_TAB, Chan), - ok = emqx_hooks:run('channel.unregistered', [ChanPid]), + ok = emqx_hooks:run('cm.channel.unregistered', [ChanPid]), true. -spec connection_closed(emqx_types:clientid()) -> true. diff --git a/apps/emqx_ft/src/emqx_ft.erl b/apps/emqx_ft/src/emqx_ft.erl index 41046907b..521d5b10a 100644 --- a/apps/emqx_ft/src/emqx_ft.erl +++ b/apps/emqx_ft/src/emqx_ft.erl @@ -28,7 +28,10 @@ -export([ on_message_publish/1, - on_message_puback/4 + on_message_puback/4, + on_client_timeout/3, + on_process_down/4, + on_channel_unregistered/1 ]). -export([ @@ -36,8 +39,6 @@ encode_filemeta/1 ]). --export([on_complete/4]). - -export_type([ clientid/0, transfer/0, @@ -85,17 +86,29 @@ checksum => checksum() }. +-define(FT_EVENT(EVENT), {?MODULE, EVENT}). 
+ %%-------------------------------------------------------------------- %% API for app %%-------------------------------------------------------------------- hook() -> ok = emqx_hooks:put('message.publish', {?MODULE, on_message_publish, []}, ?HP_LOWEST), - ok = emqx_hooks:put('message.puback', {?MODULE, on_message_puback, []}, ?HP_LOWEST). + ok = emqx_hooks:put('message.puback', {?MODULE, on_message_puback, []}, ?HP_LOWEST), + ok = emqx_hooks:put('client.timeout', {?MODULE, on_client_timeout, []}, ?HP_LOWEST), + ok = emqx_hooks:put( + 'client.monitored_process_down', {?MODULE, on_process_down, []}, ?HP_LOWEST + ), + ok = emqx_hooks:put( + 'cm.channel.unregistered', {?MODULE, on_channel_unregistered, []}, ?HP_LOWEST + ). unhook() -> ok = emqx_hooks:del('message.publish', {?MODULE, on_message_publish}), - ok = emqx_hooks:del('message.puback', {?MODULE, on_message_puback}). + ok = emqx_hooks:del('message.puback', {?MODULE, on_message_puback}), + ok = emqx_hooks:del('client.timeout', {?MODULE, on_client_timeout}), + ok = emqx_hooks:del('client.monitored_process_down', {?MODULE, on_process_down}), + ok = emqx_hooks:del('cm.channel.unregistered', {?MODULE, on_channel_unregistered}). %%-------------------------------------------------------------------- %% API @@ -145,6 +158,25 @@ on_message_puback(PacketId, #message{topic = Topic} = Msg, _PubRes, _RC) -> ignore end. +on_channel_unregistered(ChannelPid) -> + ok = emqx_ft_async_reply:deregister_all(ChannelPid). + +on_client_timeout(_TRef, ?FT_EVENT({MRef, PacketId}), Acc) -> + _ = erlang:demonitor(MRef, [flush]), + _ = emqx_ft_async_reply:take_by_mref(MRef), + {ok, [{outgoing, ?PUBACK_PACKET(PacketId, ?RC_UNSPECIFIED_ERROR)} | Acc]}; +on_client_timeout(_TRef, _Event, Acc) -> + {ok, Acc}. 
+ +on_process_down(MRef, _Pid, Reason, Acc) -> + case emqx_ft_async_reply:take_by_mref(MRef) of + {ok, PacketId, TRef} -> + _ = emqx_utils:cancel_timer(TRef), + {ok, [{outgoing, ?PUBACK_PACKET(PacketId, reason_to_rc(Reason))} | Acc]}; + not_found -> + {ok, Acc} + end. + %%-------------------------------------------------------------------- %% Handlers for transfer messages %%-------------------------------------------------------------------- @@ -208,24 +240,13 @@ on_init(PacketId, Msg, Transfer, Meta) -> transfer => Transfer, filemeta => Meta }), - PacketKey = {self(), PacketId}, - Callback = fun(Result) -> - ?MODULE:on_complete("store_filemeta", PacketKey, Transfer, Result) - end, - with_responder(PacketKey, Callback, emqx_ft_conf:init_timeout(), fun() -> - case store_filemeta(Transfer, Meta) of - % Stored, ack through the responder right away - ok -> - emqx_ft_responder:ack(PacketKey, ok); - % Storage operation started, packet will be acked by the responder - % {async, Pid} -> - % ok = emqx_ft_responder:kickoff(PacketKey, Pid), - % ok; - %% Storage operation failed, ack through the responder - {error, _} = Error -> - emqx_ft_responder:ack(PacketKey, Error) - end - end). + %% Currently synchronous. + %% If we want to make it async, we need to use `emqx_ft_async_reply`, + %% like in `on_fin`. + case store_filemeta(Transfer, Meta) of + ok -> ?RC_SUCCESS; + {error, _} -> ?RC_UNSPECIFIED_ERROR + end. 
on_abort(_Msg, _FileId) -> %% TODO @@ -240,21 +261,13 @@ on_segment(PacketId, Msg, Transfer, Offset, Checksum) -> checksum => Checksum }), Segment = {Offset, Msg#message.payload}, - PacketKey = {self(), PacketId}, - Callback = fun(Result) -> - ?MODULE:on_complete("store_segment", PacketKey, Transfer, Result) - end, - with_responder(PacketKey, Callback, emqx_ft_conf:store_segment_timeout(), fun() -> - case store_segment(Transfer, Segment) of - ok -> - emqx_ft_responder:ack(PacketKey, ok); - % {async, Pid} -> - % ok = emqx_ft_responder:kickoff(PacketKey, Pid), - % ok; - {error, _} = Error -> - emqx_ft_responder:ack(PacketKey, Error) - end - end). + %% Currently synchronous. + %% If we want to make it async, we need to use `emqx_ft_async_reply`, + %% like in `on_fin`. + case store_segment(Transfer, Segment) of + ok -> ?RC_SUCCESS; + {error, _} -> ?RC_UNSPECIFIED_ERROR + end. on_fin(PacketId, Msg, Transfer, FinalSize, FinalChecksum) -> ?tp(info, "file_transfer_fin", #{ @@ -265,37 +278,30 @@ on_fin(PacketId, Msg, Transfer, FinalSize, FinalChecksum) -> checksum => FinalChecksum }), %% TODO: handle checksum? Do we need it? - FinPacketKey = {self(), PacketId}, - Callback = fun(Result) -> - ?MODULE:on_complete("assemble", FinPacketKey, Transfer, Result) - end, - with_responder(FinPacketKey, Callback, emqx_ft_conf:assemble_timeout(), fun() -> - case assemble(Transfer, FinalSize, FinalChecksum) of - %% Assembling completed, ack through the responder right away - ok -> - emqx_ft_responder:ack(FinPacketKey, ok); - %% Assembling started, packet will be acked by the responder - {async, Pid} -> - ok = emqx_ft_responder:kickoff(FinPacketKey, Pid), - ok; - %% Assembling failed, ack through the responder - {error, _} = Error -> - emqx_ft_responder:ack(FinPacketKey, Error) - end - end). 
+ emqx_ft_async_reply:with_new_packet( + PacketId, + fun() -> + case assemble(Transfer, FinalSize, FinalChecksum) of + ok -> + ?RC_SUCCESS; + %% Assembling started, packet will be acked by monitor or timeout + {async, Pid} -> + ok = register_async_reply(Pid, PacketId), + ok = emqx_ft_storage:kickoff(Pid), + undefined; + {error, _} -> + ?RC_UNSPECIFIED_ERROR + end + end, + undefined + ). -with_responder(Key, Callback, Timeout, CriticalSection) -> - case emqx_ft_responder:start(Key, Callback, Timeout) of - %% We have new packet - {ok, _} -> - CriticalSection(); - %% Packet already received. - %% Since we are still handling the previous one, - %% we probably have retransmit here - {error, {already_started, _}} -> - ok - end, - undefined. +register_async_reply(Pid, PacketId) -> + MRef = erlang:monitor(process, Pid), + TRef = erlang:start_timer( + emqx_ft_conf:assemble_timeout(), self(), ?FT_EVENT({MRef, PacketId}) + ), + ok = emqx_ft_async_reply:register(PacketId, MRef, TRef). store_filemeta(Transfer, Segment) -> try @@ -335,28 +341,6 @@ transfer(Msg, FileId) -> ClientId = Msg#message.from, {clientid_to_binary(ClientId), FileId}. -on_complete(Op, {ChanPid, PacketId}, Transfer, Result) -> - ?tp(debug, "on_complete", #{ - operation => Op, - packet_id => PacketId, - transfer => Transfer - }), - case Result of - {Mode, ok} when Mode == ack orelse Mode == down -> - erlang:send(ChanPid, {puback, PacketId, [], ?RC_SUCCESS}); - {Mode, {error, _} = Reason} when Mode == ack orelse Mode == down -> - ?tp(error, Op ++ "_failed", #{ - transfer => Transfer, - reason => Reason - }), - erlang:send(ChanPid, {puback, PacketId, [], ?RC_UNSPECIFIED_ERROR}); - timeout -> - ?tp(error, Op ++ "_timed_out", #{ - transfer => Transfer - }), - erlang:send(ChanPid, {puback, PacketId, [], ?RC_UNSPECIFIED_ERROR}) - end. 
- validate(Validations, Fun) -> case do_validate(Validations, []) of {ok, Parsed} -> @@ -429,3 +413,20 @@ clientid_to_binary(A) when is_atom(A) -> atom_to_binary(A); clientid_to_binary(B) when is_binary(B) -> B. + +reason_to_rc(Reason) -> + case map_down_reason(Reason) of + ok -> ?RC_SUCCESS; + {error, _} -> ?RC_UNSPECIFIED_ERROR + end. + +map_down_reason(normal) -> + ok; +map_down_reason(shutdown) -> + ok; +map_down_reason({shutdown, Result}) -> + Result; +map_down_reason(noproc) -> + {error, noproc}; +map_down_reason(Error) -> + {error, {internal_error, Error}}. diff --git a/apps/emqx_ft/src/emqx_ft_app.erl b/apps/emqx_ft/src/emqx_ft_app.erl index 43a4cc816..9ef215bf9 100644 --- a/apps/emqx_ft/src/emqx_ft_app.erl +++ b/apps/emqx_ft/src/emqx_ft_app.erl @@ -22,6 +22,7 @@ start(_StartType, _StartArgs) -> {ok, Sup} = emqx_ft_sup:start_link(), + ok = emqx_ft_async_reply:create_table(), ok = emqx_ft_conf:load(), {ok, Sup}. diff --git a/apps/emqx_ft/src/emqx_ft_async_reply.erl b/apps/emqx_ft/src/emqx_ft_async_reply.erl new file mode 100644 index 000000000..4eee2c544 --- /dev/null +++ b/apps/emqx_ft/src/emqx_ft_async_reply.erl @@ -0,0 +1,106 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_ft_async_reply). 
+ +-include_lib("emqx/include/logger.hrl"). +-include_lib("emqx/include/types.hrl"). +-include_lib("stdlib/include/ms_transform.hrl"). + +-export([ + create_tables/0 +]). + +-export([ + register/3, + take_by_mref/1, + with_new_packet/3, + deregister_all/1 +]). + +-type channel_pid() :: pid(). +-type mon_ref() :: reference(). +-type timer_ref() :: reference(). +-type packet_id() :: emqx_types:packet_id(). + +%% packets waiting for async workers + +-define(WORKER_TAB, emqx_ft_async_mons). +-define(WORKER_KEY(MRef), ?WORKER_KEY(self(), MRef)). +-define(WORKER_KEY(ChannelPid, MRef), {ChannelPid, MRef}). + +%% async worker monitors by packet ids + +-define(PACKET_TAB, emqx_ft_async_packets). +-define(PACKET_KEY(PacketId), ?PACKET_KEY(self(), PacketId)). +-define(PACKET_KEY(ChannelPid, PacketId), {ChannelPid, PacketId}). + +%%-------------------------------------------------------------------- +%% API +%% ------------------------------------------------------------------- + +-spec create_tables() -> ok. +create_tables() -> + _ = ets:new(?WORKER_TAB, [named_table, public, ordered_set]), + _ = ets:new(?PACKET_TAB, [named_table, public, ordered_set]), + ok. + +-spec register(packet_id(), mon_ref(), timer_ref()) -> ok. +register(PacketId, MRef, TRef) -> + _ = ets:insert(?PACKET_TAB, {?PACKET_KEY(PacketId), MRef}), + _ = ets:insert(?WORKER_TAB, {?WORKER_KEY(MRef), PacketId, TRef}), + ok. + +-spec with_new_packet(packet_id(), fun(() -> any()), any()) -> any(). +with_new_packet(PacketId, Fun, Default) -> + case ets:member(?PACKET_TAB, ?PACKET_KEY(PacketId)) of + true -> Default; + false -> Fun() + end. + +-spec take_by_mref(mon_ref()) -> {ok, packet_id(), timer_ref()} | not_found. +take_by_mref(MRef) -> + case ets:take(?WORKER_TAB, ?WORKER_KEY(MRef)) of + [{_, PacketId, TRef}] -> + _ = ets:delete(?PACKET_TAB, ?PACKET_KEY(PacketId)), + {ok, PacketId, TRef}; + [] -> + not_found + end. + +-spec deregister_all(channel_pid()) -> ok. 
+deregister_all(ChannelPid) -> + ok = deregister_packets(ChannelPid), + ok = deregister_mons(ChannelPid), + ok. + +-spec info() -> {non_neg_integer(), non_neg_integer()}. +info() -> + {ets:info(?MON_TAB, size), ets:info(?PACKET_TAB, size)}. + +%%-------------------------------------------------------------------- +%% Internal +%%------------------------------------------------------------------- + +deregister_packets(ChannelPid) when is_pid(ChannelPid) -> + MS = [{{?PACKET_KEY(ChannelPid, '_'), '_'}, [], [true]}], + _ = ets:select_delete(?PACKET_TAB, MS), + ok. + +deregister_mons(ChannelPid) -> + MS = [{{?MON_KEY(ChannelPid, '_'), '_', '_'}, [], [true]}], + _ = ets:select_delete(?MON_TAB, MS), + ok. diff --git a/apps/emqx_ft/src/emqx_ft_responder.erl b/apps/emqx_ft/src/emqx_ft_responder.erl deleted file mode 100644 index c2c62e1c2..000000000 --- a/apps/emqx_ft/src/emqx_ft_responder.erl +++ /dev/null @@ -1,116 +0,0 @@ -%%-------------------------------------------------------------------- -%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved. -%% -%% Licensed under the Apache License, Version 2.0 (the "License"); -%% you may not use this file except in compliance with the License. -%% You may obtain a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, software -%% distributed under the License is distributed on an "AS IS" BASIS, -%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -%% See the License for the specific language governing permissions and -%% limitations under the License. -%%-------------------------------------------------------------------- - --module(emqx_ft_responder). - --behaviour(gen_server). - --include_lib("emqx/include/logger.hrl"). --include_lib("emqx/include/types.hrl"). - --include_lib("snabbkaffe/include/snabbkaffe.hrl"). - -%% API --export([start/3]). --export([kickoff/2]). --export([ack/2]). 
- -%% Supervisor API --export([start_link/3]). - --export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2]). - --define(REF(Key), {via, gproc, {n, l, {?MODULE, Key}}}). - --type key() :: term(). --type respfun() :: fun(({ack, _Result} | {down, _Result} | timeout) -> _SideEffect). - -%%-------------------------------------------------------------------- -%% API -%% ------------------------------------------------------------------- - --spec start(key(), respfun(), timeout()) -> startlink_ret(). -start(Key, RespFun, Timeout) -> - emqx_ft_responder_sup:start_child(Key, RespFun, Timeout). - --spec kickoff(key(), pid()) -> ok. -kickoff(Key, Pid) -> - gen_server:call(?REF(Key), {kickoff, Pid}). - --spec ack(key(), _Result) -> _Return. -ack(Key, Result) -> - % TODO: it's possible to avoid term copy - gen_server:call(?REF(Key), {ack, Result}, infinity). - --spec start_link(key(), timeout(), respfun()) -> startlink_ret(). -start_link(Key, RespFun, Timeout) -> - gen_server:start_link(?REF(Key), ?MODULE, {Key, RespFun, Timeout}, []). - -%%-------------------------------------------------------------------- -%% gen_server callbacks -%% ------------------------------------------------------------------- - -init({Key, RespFun, Timeout}) -> - _ = erlang:process_flag(trap_exit, true), - _TRef = erlang:send_after(Timeout, self(), timeout), - {ok, {Key, RespFun}}. - -handle_call({kickoff, Pid}, _From, St) -> - % TODO: more state? - _MRef = erlang:monitor(process, Pid), - _ = Pid ! kickoff, - {reply, ok, St}; -handle_call({ack, Result}, _From, {Key, RespFun}) -> - Ret = apply(RespFun, [{ack, Result}]), - ?tp(debug, ft_responder_ack, #{key => Key, result => Result, return => Ret}), - {stop, {shutdown, Ret}, Ret, undefined}; -handle_call(Msg, _From, State) -> - ?SLOG(warning, #{msg => "unknown_call", call_msg => Msg}), - {reply, {error, unknown_call}, State}. - -handle_cast(Msg, State) -> - ?SLOG(warning, #{msg => "unknown_cast", cast_msg => Msg}), - {noreply, State}. 
- -handle_info(timeout, {Key, RespFun}) -> - Ret = apply(RespFun, [timeout]), - ?tp(debug, ft_responder_timeout, #{key => Key, return => Ret}), - {stop, {shutdown, Ret}, undefined}; -handle_info({'DOWN', _MRef, process, _Pid, Reason}, {Key, RespFun}) -> - Ret = apply(RespFun, [{down, map_down_reason(Reason)}]), - ?tp(debug, ft_responder_procdown, #{key => Key, reason => Reason, return => Ret}), - {stop, {shutdown, Ret}, undefined}; -handle_info(Msg, State) -> - ?SLOG(warning, #{msg => "unknown_message", info_msg => Msg}), - {noreply, State}. - -terminate(_Reason, undefined) -> - ok; -terminate(Reason, {Key, RespFun}) -> - Ret = apply(RespFun, [timeout]), - ?tp(debug, ft_responder_shutdown, #{key => Key, reason => Reason, return => Ret}), - ok. - -map_down_reason(normal) -> - ok; -map_down_reason(shutdown) -> - ok; -map_down_reason({shutdown, Result}) -> - Result; -map_down_reason(noproc) -> - {error, noproc}; -map_down_reason(Error) -> - {error, {internal_error, Error}}. diff --git a/apps/emqx_ft/src/emqx_ft_responder_sup.erl b/apps/emqx_ft/src/emqx_ft_responder_sup.erl deleted file mode 100644 index fb3932425..000000000 --- a/apps/emqx_ft/src/emqx_ft_responder_sup.erl +++ /dev/null @@ -1,48 +0,0 @@ -%%-------------------------------------------------------------------- -%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved. -%% -%% Licensed under the Apache License, Version 2.0 (the "License"); -%% you may not use this file except in compliance with the License. -%% You may obtain a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, software -%% distributed under the License is distributed on an "AS IS" BASIS, -%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -%% See the License for the specific language governing permissions and -%% limitations under the License. 
-%%-------------------------------------------------------------------- - --module(emqx_ft_responder_sup). - --export([start_link/0]). --export([start_child/3]). - --behaviour(supervisor). --export([init/1]). - --define(SUPERVISOR, ?MODULE). - -%% - --spec start_link() -> {ok, pid()}. -start_link() -> - supervisor:start_link({local, ?SUPERVISOR}, ?MODULE, []). - -start_child(Key, RespFun, Timeout) -> - supervisor:start_child(?SUPERVISOR, [Key, RespFun, Timeout]). - --spec init(_) -> {ok, {supervisor:sup_flags(), [supervisor:child_spec()]}}. -init(_) -> - Flags = #{ - strategy => simple_one_for_one, - intensity => 100, - period => 100 - }, - ChildSpec = #{ - id => responder, - start => {emqx_ft_responder, start_link, []}, - restart => temporary - }, - {ok, {Flags, [ChildSpec]}}. diff --git a/apps/emqx_ft/src/emqx_ft_storage.erl b/apps/emqx_ft/src/emqx_ft_storage.erl index 04fac3b38..506cf9789 100644 --- a/apps/emqx_ft/src/emqx_ft_storage.erl +++ b/apps/emqx_ft/src/emqx_ft_storage.erl @@ -23,6 +23,7 @@ store_filemeta/2, store_segment/2, assemble/3, + kickoff/1, files/0, files/1, @@ -121,6 +122,13 @@ store_segment(Transfer, Segment) -> assemble(Transfer, Size, FinOpts) -> dispatch(assemble, [Transfer, Size, FinOpts]). +-spec kickoff(pid()) -> ok. +kickoff(Pid) -> + _ = erlang:send(Pid, kickoff), + ok. + +%% + -spec files() -> {ok, page(file_info(), _)} | {error, term()}. 
files() -> diff --git a/apps/emqx_ft/src/emqx_ft_sup.erl b/apps/emqx_ft/src/emqx_ft_sup.erl index 0308668ab..512d534c3 100644 --- a/apps/emqx_ft/src/emqx_ft_sup.erl +++ b/apps/emqx_ft/src/emqx_ft_sup.erl @@ -52,14 +52,5 @@ init([]) -> modules => [emqx_ft_storage_fs_reader_sup] }, - Responder = #{ - id => emqx_ft_responder_sup, - start => {emqx_ft_responder_sup, start_link, []}, - restart => permanent, - shutdown => infinity, - type => worker, - modules => [emqx_ft_responder_sup] - }, - - ChildSpecs = [Responder, AssemblerSup, FileReaderSup], + ChildSpecs = [AssemblerSup, FileReaderSup], {ok, {SupFlags, ChildSpecs}}. diff --git a/apps/emqx_ft/test/emqx_ft_SUITE.erl b/apps/emqx_ft/test/emqx_ft_SUITE.erl index 290cda333..405ab86ba 100644 --- a/apps/emqx_ft/test/emqx_ft_SUITE.erl +++ b/apps/emqx_ft/test/emqx_ft_SUITE.erl @@ -37,7 +37,7 @@ all() -> groups() -> [ - {single_node, [parallel], [ + {single_node, [], [ t_assemble_crash, t_corrupted_segment_retry, t_invalid_checksum, diff --git a/apps/emqx_ft/test/emqx_ft_responder_SUITE.erl b/apps/emqx_ft/test/emqx_ft_responder_SUITE.erl deleted file mode 100644 index 751861206..000000000 --- a/apps/emqx_ft/test/emqx_ft_responder_SUITE.erl +++ /dev/null @@ -1,84 +0,0 @@ -%%-------------------------------------------------------------------- -%% Copyright (c) 2020-2023 EMQ Technologies Co., Ltd. All Rights Reserved. -%% -%% Licensed under the Apache License, Version 2.0 (the "License"); -%% you may not use this file except in compliance with the License. -%% You may obtain a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, software -%% distributed under the License is distributed on an "AS IS" BASIS, -%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -%% See the License for the specific language governing permissions and -%% limitations under the License. 
-%%-------------------------------------------------------------------- - --module(emqx_ft_responder_SUITE). - --compile(export_all). --compile(nowarn_export_all). - --include_lib("stdlib/include/assert.hrl"). - -all() -> emqx_common_test_helpers:all(?MODULE). - -init_per_suite(Config) -> - ok = emqx_common_test_helpers:start_apps([emqx_ft], emqx_ft_test_helpers:env_handler(Config)), - Config. - -end_per_suite(_Config) -> - ok = emqx_common_test_helpers:stop_apps([emqx_ft]), - ok. - -init_per_testcase(_Case, Config) -> - Config. - -end_per_testcase(_Case, _Config) -> - ok. - -t_start_ack(_Config) -> - Key = <<"test">>, - DefaultAction = fun({ack, Ref}) -> Ref end, - ?assertMatch( - {ok, _Pid}, - emqx_ft_responder:start(Key, DefaultAction, 1000) - ), - ?assertMatch( - {error, {already_started, _Pid}}, - emqx_ft_responder:start(Key, DefaultAction, 1000) - ), - Ref = make_ref(), - ?assertEqual( - Ref, - emqx_ft_responder:ack(Key, Ref) - ), - ?assertExit( - {noproc, _}, - emqx_ft_responder:ack(Key, Ref) - ). - -t_timeout(_Config) -> - Key = <<"test">>, - Self = self(), - DefaultAction = fun(timeout) -> Self ! {timeout, Key} end, - {ok, _Pid} = emqx_ft_responder:start(Key, DefaultAction, 20), - receive - {timeout, Key} -> - ok - after 100 -> - ct:fail("emqx_ft_responder not called") - end, - ?assertExit( - {noproc, _}, - emqx_ft_responder:ack(Key, oops) - ). - -t_unknown_msgs(_Config) -> - {ok, Pid} = emqx_ft_responder:start(make_ref(), fun(_) -> ok end, 100), - Pid ! {unknown_msg, <<"test">>}, - ok = gen_server:cast(Pid, {unknown_msg, <<"test">>}), - ?assertEqual( - {error, unknown_call}, - gen_server:call(Pid, {unknown_call, <<"test">>}) - ). 
From b8cacd28336521db8f50a87898391bbd91c48b8e Mon Sep 17 00:00:00 2001 From: Ilya Averyanov Date: Tue, 22 Aug 2023 22:34:06 +0300 Subject: [PATCH 43/85] chore(ft): add tests for async reply registry --- apps/emqx/test/emqx_connection_SUITE.erl | 2 +- apps/emqx_ft/src/emqx_ft_app.erl | 2 +- apps/emqx_ft/src/emqx_ft_async_reply.erl | 15 +- .../test/emqx_ft_async_reply_SUITE.erl | 247 ++++++++++++++++++ 4 files changed, 257 insertions(+), 9 deletions(-) create mode 100644 apps/emqx_ft/test/emqx_ft_async_reply_SUITE.erl diff --git a/apps/emqx/test/emqx_connection_SUITE.erl b/apps/emqx/test/emqx_connection_SUITE.erl index ea451eea5..2a96594e1 100644 --- a/apps/emqx/test/emqx_connection_SUITE.erl +++ b/apps/emqx/test/emqx_connection_SUITE.erl @@ -49,7 +49,7 @@ init_per_suite(Config) -> %% Meck Hooks ok = meck:new(emqx_hooks, [passthrough, no_history, no_link]), ok = meck:expect(emqx_hooks, run, fun(_Hook, _Args) -> ok end), - ok = meck:expect(emqx_hooks, run_fold, fun(_Hook, _Args, Acc) -> {ok, Acc} end), + ok = meck:expect(emqx_hooks, run_fold, fun(_Hook, _Args, Acc) -> Acc end), ok = meck:expect(emqx_channel, ensure_disconnected, fun(_, Channel) -> Channel end), diff --git a/apps/emqx_ft/src/emqx_ft_app.erl b/apps/emqx_ft/src/emqx_ft_app.erl index 9ef215bf9..114b4bff3 100644 --- a/apps/emqx_ft/src/emqx_ft_app.erl +++ b/apps/emqx_ft/src/emqx_ft_app.erl @@ -22,7 +22,7 @@ start(_StartType, _StartArgs) -> {ok, Sup} = emqx_ft_sup:start_link(), - ok = emqx_ft_async_reply:create_table(), + ok = emqx_ft_async_reply:create_tables(), ok = emqx_ft_conf:load(), {ok, Sup}. diff --git a/apps/emqx_ft/src/emqx_ft_async_reply.erl b/apps/emqx_ft/src/emqx_ft_async_reply.erl index 4eee2c544..f33558434 100644 --- a/apps/emqx_ft/src/emqx_ft_async_reply.erl +++ b/apps/emqx_ft/src/emqx_ft_async_reply.erl @@ -21,7 +21,8 @@ -include_lib("stdlib/include/ms_transform.hrl"). -export([ - create_tables/0 + create_tables/0, + info/0 ]). 
-export([ @@ -38,9 +39,9 @@ %% packets waiting for async workers --define(WORKER_TAB, emqx_ft_async_mons). --define(WORKER_KEY(MRef), ?WORKER_KEY(self(), MRef)). --define(WORKER_KEY(ChannelPid, MRef), {ChannelPid, MRef}). +-define(MON_TAB, emqx_ft_async_mons). +-define(MON_KEY(MRef), ?MON_KEY(self(), MRef)). +-define(MON_KEY(ChannelPid, MRef), {ChannelPid, MRef}). %% async worker monitors by packet ids @@ -54,14 +55,14 @@ -spec create_tables() -> ok. create_tables() -> - _ = ets:new(?WORKER_TAB, [named_table, public, ordered_set]), + _ = ets:new(?MON_TAB, [named_table, public, ordered_set]), _ = ets:new(?PACKET_TAB, [named_table, public, ordered_set]), ok. -spec register(packet_id(), mon_ref(), timer_ref()) -> ok. register(PacketId, MRef, TRef) -> _ = ets:insert(?PACKET_TAB, {?PACKET_KEY(PacketId), MRef}), - _ = ets:insert(?WORKER_TAB, {?WORKER_KEY(MRef), PacketId, TRef}), + _ = ets:insert(?MON_TAB, {?MON_KEY(MRef), PacketId, TRef}), ok. -spec with_new_packet(packet_id(), fun(() -> any()), any()) -> any(). @@ -73,7 +74,7 @@ with_new_packet(PacketId, Fun, Default) -> -spec take_by_mref(mon_ref()) -> {ok, packet_id(), timer_ref()} | not_found. take_by_mref(MRef) -> - case ets:take(?WORKER_TAB, ?WORKER_KEY(MRef)) of + case ets:take(?MON_TAB, ?MON_KEY(MRef)) of [{_, PacketId, TRef}] -> _ = ets:delete(?PACKET_TAB, ?PACKET_KEY(PacketId)), {ok, PacketId, TRef}; diff --git a/apps/emqx_ft/test/emqx_ft_async_reply_SUITE.erl b/apps/emqx_ft/test/emqx_ft_async_reply_SUITE.erl new file mode 100644 index 000000000..78a9b371c --- /dev/null +++ b/apps/emqx_ft/test/emqx_ft_async_reply_SUITE.erl @@ -0,0 +1,247 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2020-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. 
+%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_ft_async_reply_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("common_test/include/ct.hrl"). +-include_lib("stdlib/include/assert.hrl"). +-include_lib("emqx/include/asserts.hrl"). + +all() -> emqx_common_test_helpers:all(?MODULE). + +init_per_suite(Config) -> + Apps = emqx_cth_suite:start( + [ + {emqx, #{override_env => [{boot_modules, [broker, listeners]}]}}, + {emqx_ft, "file_transfer { enable = true, assemble_timeout = 1s }"} + ], + #{work_dir => ?config(priv_dir, Config)} + ), + [{suite_apps, Apps} | Config]. + +end_per_suite(Config) -> + ok = emqx_cth_suite:stop(?config(suite_apps, Config)), + ok. + +init_per_testcase(_Case, Config) -> + ok = snabbkaffe:start_trace(), + Config. + +end_per_testcase(_Case, _Config) -> + ok = snabbkaffe:stop(), + ok. + +%%-------------------------------------------------------------------- +%% Tests +%%-------------------------------------------------------------------- + +t_register(_Config) -> + PacketId = 1, + MRef = make_ref(), + TRef = make_ref(), + ok = emqx_ft_async_reply:register(PacketId, MRef, TRef), + + ?assertEqual( + undefined, + emqx_ft_async_reply:with_new_packet(PacketId, fun() -> ok end, undefined) + ), + + ?assertEqual( + ok, + emqx_ft_async_reply:with_new_packet(2, fun() -> ok end, undefined) + ), + + ?assertEqual( + {ok, PacketId, TRef}, + emqx_ft_async_reply:take_by_mref(MRef) + ). 
+ +t_process_independence(_Config) -> + PacketId = 1, + MRef = make_ref(), + TRef = make_ref(), + ok = emqx_ft_async_reply:register(PacketId, MRef, TRef), + + Self = self(), + + spawn_link(fun() -> + Self ! emqx_ft_async_reply:take_by_mref(MRef) + end), + + Res1 = + receive + Msg1 -> Msg1 + end, + + ?assertEqual( + not_found, + Res1 + ), + + spawn_link(fun() -> + Self ! emqx_ft_async_reply:with_new_packet(PacketId, fun() -> ok end, undefined) + end), + + Res2 = + receive + Msg2 -> Msg2 + end, + + ?assertEqual( + ok, + Res2 + ). + +t_take(_Config) -> + PacketId = 1, + MRef = make_ref(), + TRef = make_ref(), + ok = emqx_ft_async_reply:register(PacketId, MRef, TRef), + + ?assertEqual( + {ok, PacketId, TRef}, + emqx_ft_async_reply:take_by_mref(MRef) + ), + + ?assertEqual( + not_found, + emqx_ft_async_reply:take_by_mref(MRef) + ), + + ?assertEqual( + ok, + emqx_ft_async_reply:with_new_packet(2, fun() -> ok end, undefined) + ). + +t_cleanup(_Config) -> + PacketId = 1, + MRef0 = make_ref(), + TRef0 = make_ref(), + MRef1 = make_ref(), + TRef1 = make_ref(), + ok = emqx_ft_async_reply:register(PacketId, MRef0, TRef0), + + Self = self(), + + Pid = spawn_link(fun() -> + ok = emqx_ft_async_reply:register(PacketId, MRef1, TRef1), + receive + kickoff -> + ?assertEqual( + undefined, + emqx_ft_async_reply:with_new_packet(PacketId, fun() -> ok end, undefined) + ), + + ?assertEqual( + {ok, PacketId, TRef1}, + emqx_ft_async_reply:take_by_mref(MRef1) + ), + + Self ! done + end + end), + + ?assertEqual( + undefined, + emqx_ft_async_reply:with_new_packet(PacketId, fun() -> ok end, undefined) + ), + + ok = emqx_ft_async_reply:deregister_all(Self), + + ?assertEqual( + ok, + emqx_ft_async_reply:with_new_packet(PacketId, fun() -> ok end, undefined) + ), + + Pid ! kickoff, + + receive + done -> ok + end. 
+ +t_reply_by_tiemout(_Config) -> + process_flag(trap_exit, true), + ClientId = atom_to_binary(?FUNCTION_NAME), + C = emqx_ft_test_helpers:start_client(ClientId, node()), + + SleepForever = fun() -> + Ref = make_ref(), + receive + Ref -> ok + end + end, + + ok = meck:new(emqx_ft_storage, [passthrough]), + meck:expect(emqx_ft_storage, assemble, fun(_, _, _) -> {async, spawn_link(SleepForever)} end), + + FinTopic = <<"$file/fakeid/fin/999999">>, + + ?assertMatch( + {ok, #{reason_code_name := unspecified_error}}, + emqtt:publish(C, FinTopic, <<>>, 1) + ), + + meck:unload(emqx_ft_storage), + emqtt:stop(C). + +t_cleanup_by_cm(_Config) -> + process_flag(trap_exit, true), + ClientId = atom_to_binary(?FUNCTION_NAME), + C = emqx_ft_test_helpers:start_client(ClientId, node()), + + ok = meck:new(emqx_ft_storage, [passthrough]), + meck:expect(emqx_ft_storage, kickoff, fun(_) -> meck:exception(error, oops) end), + + FinTopic = <<"$file/fakeid/fin/999999">>, + + [ClientPid] = emqx_cm:lookup_channels(ClientId), + + ?assertWaitEvent( + begin + emqtt:publish(C, FinTopic, <<>>, 1), + exit(ClientPid, kill) + end, + #{?snk_kind := emqx_cm_clean_down, client_id := ClientId}, + 1000 + ), + + ?assertEqual( + {0, 0}, + emqx_ft_async_reply:info() + ), + + meck:unload(emqx_ft_storage). + +t_unrelated_events(_Config) -> + process_flag(trap_exit, true), + ClientId = atom_to_binary(?FUNCTION_NAME), + C = emqx_ft_test_helpers:start_client(ClientId, node()), + [ClientPid] = emqx_cm:lookup_channels(ClientId), + + erlang:monitor(process, ClientPid), + + ClientPid ! {'DOWN', make_ref(), process, self(), normal}, + ClientPid ! {timeout, make_ref(), unknown_timer_event}, + + ?assertNotReceive( + {'DOWN', _Ref, process, ClientPid, _Reason}, + 500 + ), + + emqtt:stop(C). 
From 39a48179ea29d672e4f3e5864e850b3019bb14cb Mon Sep 17 00:00:00 2001 From: Ilya Averyanov Date: Wed, 23 Aug 2023 12:09:40 +0300 Subject: [PATCH 44/85] chore(emqx_channel): use macros for reply construction --- apps/emqx/include/emqx_channel.hrl | 5 +++++ apps/emqx/src/emqx_channel.erl | 13 +++++++------ apps/emqx_ft/src/emqx_ft.erl | 6 ++++-- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/apps/emqx/include/emqx_channel.hrl b/apps/emqx/include/emqx_channel.hrl index be2448a20..53abcafd6 100644 --- a/apps/emqx/include/emqx_channel.hrl +++ b/apps/emqx/include/emqx_channel.hrl @@ -41,4 +41,9 @@ will_msg ]). +-define(REPLY_OUTGOING(Packets), {outgoing, Packets}). +-define(REPLY_CONNACK(Packet), {connack, Packet}). +-define(REPLY_EVENT(StateOrEvent), {event, StateOrEvent}). +-define(REPLY_CLOSE(Reason), {close, Reason}). + -define(EXPIRE_INTERVAL_INFINITE, 4294967295000). diff --git a/apps/emqx/src/emqx_channel.erl b/apps/emqx/src/emqx_channel.erl index 8d0a58767..53cac2400 100644 --- a/apps/emqx/src/emqx_channel.erl +++ b/apps/emqx/src/emqx_channel.erl @@ -122,6 +122,7 @@ -type reply() :: {outgoing, emqx_types:packet()} | {outgoing, [emqx_types:packet()]} + | {connack, emqx_types:packet()} | {event, conn_state() | updated} | {close, Reason :: atom()}. 
@@ -1023,7 +1024,7 @@ handle_out(publish, [], Channel) -> {ok, Channel}; handle_out(publish, Publishes, Channel) -> {Packets, NChannel} = do_deliver(Publishes, Channel), - {ok, {outgoing, Packets}, NChannel}; + {ok, ?REPLY_OUTGOING(Packets), NChannel}; handle_out(puback, {PacketId, ReasonCode}, Channel) -> {ok, ?PUBACK_PACKET(PacketId, ReasonCode), Channel}; handle_out(pubrec, {PacketId, ReasonCode}, Channel) -> @@ -1048,7 +1049,7 @@ handle_out(disconnect, {ReasonCode, ReasonName}, Channel) -> handle_out(disconnect, {ReasonCode, ReasonName, #{}}, Channel); handle_out(disconnect, {ReasonCode, ReasonName, Props}, Channel = ?IS_MQTT_V5) -> Packet = ?DISCONNECT_PACKET(ReasonCode, Props), - {ok, [{outgoing, Packet}, {close, ReasonName}], Channel}; + {ok, [?REPLY_OUTGOING(Packet), {close, ReasonName}], Channel}; handle_out(disconnect, {_ReasonCode, ReasonName, _Props}, Channel) -> {ok, {close, ReasonName}, Channel}; handle_out(auth, {ReasonCode, Properties}, Channel) -> @@ -1062,7 +1063,7 @@ handle_out(Type, Data, Channel) -> %%-------------------------------------------------------------------- return_connack(AckPacket, Channel) -> - Replies = [{event, connected}, {connack, AckPacket}], + Replies = [?REPLY_EVENT(connected), ?REPLY_CONNACK(AckPacket)], case maybe_resume_session(Channel) of ignore -> {ok, Replies, Channel}; @@ -1073,7 +1074,7 @@ return_connack(AckPacket, Channel) -> session = NSession }, {Packets, NChannel2} = do_deliver(Publishes, NChannel1), - Outgoing = [{outgoing, Packets} || length(Packets) > 0], + Outgoing = [?REPLY_OUTGOING(Packets) || length(Packets) > 0], {ok, Replies ++ Outgoing, NChannel2} end. @@ -1121,7 +1122,7 @@ do_deliver(Publishes, Channel) when is_list(Publishes) -> %%-------------------------------------------------------------------- return_sub_unsub_ack(Packet, Channel) -> - {ok, [{outgoing, Packet}, {event, updated}], Channel}. + {ok, [?REPLY_OUTGOING(Packet), ?REPLY_EVENT(updated)], Channel}. 
%%-------------------------------------------------------------------- %% Handle call @@ -1235,7 +1236,7 @@ handle_info( -> Channel1 = ensure_disconnected(Reason, maybe_publish_will_msg(Channel)), case maybe_shutdown(Reason, Channel1) of - {ok, Channel2} -> {ok, {event, disconnected}, Channel2}; + {ok, Channel2} -> {ok, ?REPLY_EVENT(disconnected), Channel2}; Shutdown -> Shutdown end; handle_info({sock_closed, Reason}, Channel = #channel{conn_state = disconnected}) -> diff --git a/apps/emqx_ft/src/emqx_ft.erl b/apps/emqx_ft/src/emqx_ft.erl index 521d5b10a..6a98c51f0 100644 --- a/apps/emqx_ft/src/emqx_ft.erl +++ b/apps/emqx_ft/src/emqx_ft.erl @@ -18,7 +18,9 @@ -include_lib("emqx/include/emqx.hrl"). -include_lib("emqx/include/emqx_mqtt.hrl"). +-include_lib("emqx/include/emqx_channel.hrl"). -include_lib("emqx/include/emqx_hooks.hrl"). + -include_lib("snabbkaffe/include/trace.hrl"). -export([ @@ -164,7 +166,7 @@ on_channel_unregistered(ChannelPid) -> on_client_timeout(_TRef, ?FT_EVENT({MRef, PacketId}), Acc) -> _ = erlang:demonitor(MRef, [flush]), _ = emqx_ft_async_reply:take_by_mref(MRef), - {ok, [{outgoing, ?PUBACK_PACKET(PacketId, ?RC_UNSPECIFIED_ERROR)} | Acc]}; + {ok, [?REPLY_OUTGOING(?PUBACK_PACKET(PacketId, ?RC_UNSPECIFIED_ERROR)) | Acc]}; on_client_timeout(_TRef, _Event, Acc) -> {ok, Acc}. @@ -172,7 +174,7 @@ on_process_down(MRef, _Pid, Reason, Acc) -> case emqx_ft_async_reply:take_by_mref(MRef) of {ok, PacketId, TRef} -> _ = emqx_utils:cancel_timer(TRef), - {ok, [{outgoing, ?PUBACK_PACKET(PacketId, reason_to_rc(Reason))} | Acc]}; + {ok, [?REPLY_OUTGOING(?PUBACK_PACKET(PacketId, reason_to_rc(Reason))) | Acc]}; not_found -> {ok, Acc} end. 
From 4488e9e591592a27dd95797ba43497d0cf2d5b12 Mon Sep 17 00:00:00 2001 From: Ilya Averyanov Date: Wed, 23 Aug 2023 12:13:13 +0300 Subject: [PATCH 45/85] chore(ft): stop hook chain when doing actual handling --- apps/emqx_ft/src/emqx_ft.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/emqx_ft/src/emqx_ft.erl b/apps/emqx_ft/src/emqx_ft.erl index 6a98c51f0..41020e76f 100644 --- a/apps/emqx_ft/src/emqx_ft.erl +++ b/apps/emqx_ft/src/emqx_ft.erl @@ -166,7 +166,7 @@ on_channel_unregistered(ChannelPid) -> on_client_timeout(_TRef, ?FT_EVENT({MRef, PacketId}), Acc) -> _ = erlang:demonitor(MRef, [flush]), _ = emqx_ft_async_reply:take_by_mref(MRef), - {ok, [?REPLY_OUTGOING(?PUBACK_PACKET(PacketId, ?RC_UNSPECIFIED_ERROR)) | Acc]}; + {stop, [?REPLY_OUTGOING(?PUBACK_PACKET(PacketId, ?RC_UNSPECIFIED_ERROR)) | Acc]}; on_client_timeout(_TRef, _Event, Acc) -> {ok, Acc}. @@ -174,7 +174,7 @@ on_process_down(MRef, _Pid, Reason, Acc) -> case emqx_ft_async_reply:take_by_mref(MRef) of {ok, PacketId, TRef} -> _ = emqx_utils:cancel_timer(TRef), - {ok, [?REPLY_OUTGOING(?PUBACK_PACKET(PacketId, reason_to_rc(Reason))) | Acc]}; + {stop, [?REPLY_OUTGOING(?PUBACK_PACKET(PacketId, reason_to_rc(Reason))) | Acc]}; not_found -> {ok, Acc} end. 
From 9bded078345048bc460983af6f85cbdd19751445 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Mon, 14 Aug 2023 17:04:53 -0300 Subject: [PATCH 46/85] feat: cluster purge Fixes https://emqx.atlassian.net/browse/EMQX-10763 --- apps/emqx/priv/bpapi.versions | 4 + apps/emqx/src/emqx_cm.erl | 21 ++ .../src/emqx_eviction_agent.app.src | 2 +- .../src/emqx_eviction_agent.erl | 29 ++ .../test/emqx_eviction_agent_test_helpers.erl | 6 +- .../src/emqx_node_rebalance.app.src | 2 +- .../src/emqx_node_rebalance.erl | 4 +- .../src/emqx_node_rebalance_agent.erl | 24 +- .../src/emqx_node_rebalance_api.erl | 131 ++++++- .../src/emqx_node_rebalance_cli.erl | 79 +++- .../src/emqx_node_rebalance_purge.erl | 233 ++++++++++++ .../src/emqx_node_rebalance_status.erl | 63 +++- .../src/emqx_node_rebalance_sup.erl | 1 + .../emqx_node_rebalance_api_proto_v2.erl | 59 +++ .../proto/emqx_node_rebalance_proto_v2.erl | 84 +++++ .../emqx_node_rebalance_purge_proto_v1.erl | 29 ++ .../emqx_node_rebalance_status_proto_v2.erl | 46 +++ .../test/emqx_node_rebalance_api_SUITE.erl | 152 ++++++-- .../test/emqx_node_rebalance_cli_SUITE.erl | 90 +++++ .../test/emqx_node_rebalance_purge_SUITE.erl | 353 ++++++++++++++++++ .../test/emqx_node_rebalance_status_SUITE.erl | 2 +- changes/ee/feat-11447.en.md | 1 + rel/i18n/emqx_node_rebalance_api.hocon | 18 + 23 files changed, 1375 insertions(+), 58 deletions(-) create mode 100644 apps/emqx_node_rebalance/src/emqx_node_rebalance_purge.erl create mode 100644 apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_api_proto_v2.erl create mode 100644 apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_proto_v2.erl create mode 100644 apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_purge_proto_v1.erl create mode 100644 apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_status_proto_v2.erl create mode 100644 apps/emqx_node_rebalance/test/emqx_node_rebalance_purge_SUITE.erl create mode 100644 changes/ee/feat-11447.en.md diff --git 
a/apps/emqx/priv/bpapi.versions b/apps/emqx/priv/bpapi.versions index 68d42ee01..cb9c3ad82 100644 --- a/apps/emqx/priv/bpapi.versions +++ b/apps/emqx/priv/bpapi.versions @@ -37,9 +37,13 @@ {emqx_mgmt_trace,1}. {emqx_mgmt_trace,2}. {emqx_node_rebalance,1}. +{emqx_node_rebalance,2}. {emqx_node_rebalance_api,1}. +{emqx_node_rebalance_api,2}. {emqx_node_rebalance_evacuation,1}. +{emqx_node_rebalance_purge,1}. {emqx_node_rebalance_status,1}. +{emqx_node_rebalance_status,2}. {emqx_persistent_session,1}. {emqx_persistent_session_ds,1}. {emqx_plugins,1}. diff --git a/apps/emqx/src/emqx_cm.erl b/apps/emqx/src/emqx_cm.erl index 2cc2b72b4..d543b98d7 100644 --- a/apps/emqx/src/emqx_cm.erl +++ b/apps/emqx/src/emqx_cm.erl @@ -78,6 +78,7 @@ %% Client management -export([ + all_channels_table/1, channel_with_session_table/1, live_connection_table/1 ]). @@ -593,6 +594,26 @@ channel_with_session_table(ConnModuleList) -> sets:is_element(ConnModule, ConnModules) ]). +%% @doc Get clientinfo for all clients, regardless if they use clean start or not. +all_channels_table(ConnModuleList) -> + Ms = ets:fun2ms( + fun({{ClientId, _ChanPid}, Info, _Stats}) -> + {ClientId, Info} + end + ), + Table = ets:table(?CHAN_INFO_TAB, [{traverse, {select, Ms}}]), + ConnModules = sets:from_list(ConnModuleList, [{version, 2}]), + qlc:q([ + {ClientId, ConnState, ConnInfo, ClientInfo} + || {ClientId, #{ + conn_state := ConnState, + clientinfo := ClientInfo, + conninfo := #{conn_mod := ConnModule} = ConnInfo + }} <- + Table, + sets:is_element(ConnModule, ConnModules) + ]). 
+ %% @doc Get all local connection query handle live_connection_table(ConnModules) -> Ms = lists:map(fun live_connection_ms/1, ConnModules), diff --git a/apps/emqx_eviction_agent/src/emqx_eviction_agent.app.src b/apps/emqx_eviction_agent/src/emqx_eviction_agent.app.src index f9f6334c3..c11f52fe7 100644 --- a/apps/emqx_eviction_agent/src/emqx_eviction_agent.app.src +++ b/apps/emqx_eviction_agent/src/emqx_eviction_agent.app.src @@ -1,6 +1,6 @@ {application, emqx_eviction_agent, [ {description, "EMQX Eviction Agent"}, - {vsn, "5.1.0"}, + {vsn, "5.1.1"}, {registered, [ emqx_eviction_agent_sup, emqx_eviction_agent, diff --git a/apps/emqx_eviction_agent/src/emqx_eviction_agent.erl b/apps/emqx_eviction_agent/src/emqx_eviction_agent.erl index 9a29adc69..7376c8069 100644 --- a/apps/emqx_eviction_agent/src/emqx_eviction_agent.erl +++ b/apps/emqx_eviction_agent/src/emqx_eviction_agent.erl @@ -23,6 +23,7 @@ evict_connections/1, evict_sessions/2, evict_sessions/3, + purge_sessions/1, evict_session_channel/3 ]). @@ -113,6 +114,14 @@ evict_sessions(N, Nodes, ConnState) when {error, disabled} end. +purge_sessions(N) -> + case enable_status() of + {enabled, _Kind, _ServerReference} -> + ok = do_purge_sessions(N); + disabled -> + {error, disabled} + end. + %%-------------------------------------------------------------------- %% gen_server callbacks %%-------------------------------------------------------------------- @@ -247,6 +256,17 @@ take_connections(N) -> ok = qlc:delete_cursor(ChanPidCursor), ChanPids. +take_channels(N) -> + QH = qlc:q([ + {ClientId, ConnInfo, ClientInfo} + || {ClientId, _, ConnInfo, ClientInfo} <- + emqx_cm:all_channels_table(?CONN_MODULES) + ]), + ChanPidCursor = qlc:cursor(QH), + Channels = qlc:next_answers(ChanPidCursor, N), + ok = qlc:delete_cursor(ChanPidCursor), + Channels. 
+ take_channel_with_sessions(N, ConnState) -> ChanPidCursor = qlc:cursor(channel_with_session_table(ConnState)), Channels = qlc:next_answers(ChanPidCursor, N), @@ -344,5 +364,14 @@ disconnect_channel(ChanPid, ServerReference) -> 'Server-Reference' => ServerReference }}. +do_purge_sessions(N) when N > 0 -> + Channels = take_channels(N), + ok = lists:foreach( + fun({ClientId, _ConnInfo, _ClientInfo}) -> + emqx_cm:discard_session(ClientId) + end, + Channels + ). + select_random(List) when length(List) > 0 -> lists:nth(rand:uniform(length(List)), List). diff --git a/apps/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl b/apps/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl index 7425cb145..130a2628a 100644 --- a/apps/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl +++ b/apps/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl @@ -9,6 +9,7 @@ emqtt_connect/1, emqtt_connect/2, emqtt_connect_many/2, + emqtt_connect_many/3, stop_many/1, emqtt_try_connect/1, @@ -42,6 +43,9 @@ emqtt_connect(Opts) -> end. emqtt_connect_many(Port, Count) -> + emqtt_connect_many(Port, Count, _StartN = 1). + +emqtt_connect_many(Port, Count, StartN) -> lists:map( fun(N) -> NBin = integer_to_binary(N), @@ -49,7 +53,7 @@ emqtt_connect_many(Port, Count) -> {ok, C} = emqtt_connect([{clientid, ClientId}, {clean_start, false}, {port, Port}]), C end, - lists:seq(1, Count) + lists:seq(StartN, StartN + Count - 1) ). 
stop_many(Clients) -> diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance.app.src b/apps/emqx_node_rebalance/src/emqx_node_rebalance.app.src index edfa6574e..f6b619d39 100644 --- a/apps/emqx_node_rebalance/src/emqx_node_rebalance.app.src +++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance.app.src @@ -1,6 +1,6 @@ {application, emqx_node_rebalance, [ {description, "EMQX Node Rebalance"}, - {vsn, "5.0.4"}, + {vsn, "5.0.5"}, {registered, [ emqx_node_rebalance_sup, emqx_node_rebalance, diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance.erl index 9d53841ed..b2044c5fa 100644 --- a/apps/emqx_node_rebalance/src/emqx_node_rebalance.erl +++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance.erl @@ -81,7 +81,7 @@ start_link() -> -spec available_nodes(list(node())) -> list(node()). available_nodes(Nodes) when is_list(Nodes) -> - {Available, _} = emqx_node_rebalance_proto_v1:available_nodes(Nodes), + {Available, _} = emqx_node_rebalance_proto_v2:available_nodes(Nodes), lists:filter(fun is_atom/1, Available). %%-------------------------------------------------------------------- @@ -370,7 +370,7 @@ avg(List) when length(List) >= 1 -> lists:sum(List) / length(List). multicall(Nodes, F, A) -> - case apply(emqx_node_rebalance_proto_v1, F, [Nodes | A]) of + case apply(emqx_node_rebalance_proto_v2, F, [Nodes | A]) of {Results, []} -> case lists:partition(fun is_ok/1, lists:zip(Nodes, Results)) of {OkResults, []} -> diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance_agent.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_agent.erl index 47708d00e..250d03d9c 100644 --- a/apps/emqx_node_rebalance/src/emqx_node_rebalance_agent.erl +++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance_agent.erl @@ -14,7 +14,9 @@ -export([ start_link/0, enable/1, + enable/2, disable/1, + disable/2, status/0 ]). 
@@ -40,11 +42,21 @@ start_link() -> -spec enable(pid()) -> ok_or_error(already_enabled | eviction_agent_busy). enable(CoordinatorPid) -> - gen_server:call(?MODULE, {enable, CoordinatorPid}). + enable(CoordinatorPid, ?ENABLE_KIND). + +-spec enable(pid(), emqx_eviction_agent:kind()) -> + ok_or_error(already_enabled | eviction_agent_busy). +enable(CoordinatorPid, Kind) -> + gen_server:call(?MODULE, {enable, CoordinatorPid, Kind}). -spec disable(pid()) -> ok_or_error(already_disabled | invalid_coordinator). disable(CoordinatorPid) -> - gen_server:call(?MODULE, {disable, CoordinatorPid}). + disable(CoordinatorPid, ?ENABLE_KIND). + +-spec disable(pid(), emqx_eviction_agent:kind()) -> + ok_or_error(already_disabled | invalid_coordinator). +disable(CoordinatorPid, Kind) -> + gen_server:call(?MODULE, {disable, CoordinatorPid, Kind}). -spec status() -> status(). status() -> @@ -57,7 +69,7 @@ status() -> init([]) -> {ok, #{}}. -handle_call({enable, CoordinatorPid}, _From, St) -> +handle_call({enable, CoordinatorPid, Kind}, _From, St) -> case St of #{coordinator_pid := _Pid} -> {reply, {error, already_enabled}, St}; @@ -65,7 +77,7 @@ handle_call({enable, CoordinatorPid}, _From, St) -> true = link(CoordinatorPid), EvictionAgentPid = whereis(emqx_eviction_agent), true = link(EvictionAgentPid), - case emqx_eviction_agent:enable(?ENABLE_KIND, undefined) of + case emqx_eviction_agent:enable(Kind, undefined) of ok -> {reply, ok, #{ coordinator_pid => CoordinatorPid, @@ -77,13 +89,13 @@ handle_call({enable, CoordinatorPid}, _From, St) -> {reply, {error, eviction_agent_busy}, St} end end; -handle_call({disable, CoordinatorPid}, _From, St) -> +handle_call({disable, CoordinatorPid, Kind}, _From, St) -> case St of #{ coordinator_pid := CoordinatorPid, eviction_agent_pid := EvictionAgentPid } -> - _ = emqx_eviction_agent:disable(?ENABLE_KIND), + _ = emqx_eviction_agent:disable(Kind), true = unlink(EvictionAgentPid), true = unlink(CoordinatorPid), NewSt = maps:without( diff --git 
a/apps/emqx_node_rebalance/src/emqx_node_rebalance_api.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_api.erl index 430ad1e34..ddeebf188 100644 --- a/apps/emqx_node_rebalance/src/emqx_node_rebalance_api.erl +++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance_api.erl @@ -31,7 +31,9 @@ '/load_rebalance/:node/start'/2, '/load_rebalance/:node/stop'/2, '/load_rebalance/:node/evacuation/start'/2, - '/load_rebalance/:node/evacuation/stop'/2 + '/load_rebalance/:node/evacuation/stop'/2, + '/load_rebalance/:node/purge/start'/2, + '/load_rebalance/:node/purge/stop'/2 ]). %% Schema examples @@ -66,7 +68,9 @@ paths() -> "/load_rebalance/:node/start", "/load_rebalance/:node/stop", "/load_rebalance/:node/evacuation/start", - "/load_rebalance/:node/evacuation/stop" + "/load_rebalance/:node/evacuation/stop", + "/load_rebalance/:node/purge/start", + "/load_rebalance/:node/purge/stop" ]. schema("/load_rebalance/status") -> @@ -175,6 +179,41 @@ schema("/load_rebalance/:node/evacuation/stop") -> 404 => error_codes([?NOT_FOUND], <<"Not Found">>) } } + }; +schema("/load_rebalance/:node/purge/start") -> + #{ + 'operationId' => '/load_rebalance/:node/purge/start', + post => #{ + tags => [<<"load_rebalance">>], + summary => <<"Start purge on the whole cluster">>, + description => ?DESC("cluster_purge_start"), + parameters => [param_node()], + 'requestBody' => + emqx_dashboard_swagger:schema_with_examples( + ref(purge_start), + purge_example() + ), + responses => #{ + 200 => response_schema(), + 400 => error_codes([?BAD_REQUEST], <<"Bad Request">>), + 404 => error_codes([?NOT_FOUND], <<"Not Found">>) + } + } + }; +schema("/load_rebalance/:node/purge/stop") -> + #{ + 'operationId' => '/load_rebalance/:node/purge/stop', + post => #{ + tags => [<<"load_rebalance">>], + summary => <<"Stop purge on the whole cluster">>, + description => ?DESC("cluster_purge_stop"), + parameters => [param_node()], + responses => #{ + 200 => response_schema(), + 400 => error_codes([?BAD_REQUEST], <<"Bad 
Request">>), + 404 => error_codes([?NOT_FOUND], <<"Not Found">>) + } + } }. %%-------------------------------------------------------------------- @@ -188,16 +227,20 @@ schema("/load_rebalance/:node/evacuation/stop") -> {rebalance, Stats} -> {200, format_status(rebalance, Stats)}; {evacuation, Stats} -> - {200, format_status(evacuation, Stats)} + {200, format_status(evacuation, Stats)}; + {purge, Stats} -> + {200, format_status(purge, Stats)} end. '/load_rebalance/global_status'(get, #{}) -> #{ evacuations := Evacuations, + purges := Purges, rebalances := Rebalances } = emqx_node_rebalance_status:global_status(), {200, #{ evacuations => format_as_map_list(Evacuations), + purges => format_as_map_list(Purges), rebalances => format_as_map_list(Rebalances) }}. @@ -214,7 +257,7 @@ schema("/load_rebalance/:node/evacuation/stop") -> Params1 = translate(rebalance_start, Params0), with_nodes_at_key(nodes, Params1, fun(Params2) -> wrap_rpc( - Node, emqx_node_rebalance_api_proto_v1:node_rebalance_start(Node, Params2) + Node, emqx_node_rebalance_api_proto_v2:node_rebalance_start(Node, Params2) ) end) end). @@ -222,7 +265,7 @@ schema("/load_rebalance/:node/evacuation/stop") -> '/load_rebalance/:node/stop'(post, #{bindings := #{node := NodeBin}}) -> emqx_utils_api:with_node(NodeBin, fun(Node) -> wrap_rpc( - Node, emqx_node_rebalance_api_proto_v1:node_rebalance_stop(Node) + Node, emqx_node_rebalance_api_proto_v2:node_rebalance_stop(Node) ) end). 
@@ -234,7 +277,7 @@ schema("/load_rebalance/:node/evacuation/stop") -> with_nodes_at_key(migrate_to, Params1, fun(Params2) -> wrap_rpc( Node, - emqx_node_rebalance_api_proto_v1:node_rebalance_evacuation_start( + emqx_node_rebalance_api_proto_v2:node_rebalance_evacuation_start( Node, Params2 ) ) @@ -244,7 +287,27 @@ schema("/load_rebalance/:node/evacuation/stop") -> '/load_rebalance/:node/evacuation/stop'(post, #{bindings := #{node := NodeBin}}) -> emqx_utils_api:with_node(NodeBin, fun(Node) -> wrap_rpc( - Node, emqx_node_rebalance_api_proto_v1:node_rebalance_evacuation_stop(Node) + Node, emqx_node_rebalance_api_proto_v2:node_rebalance_evacuation_stop(Node) + ) + end). + +'/load_rebalance/:node/purge/start'(post, #{ + bindings := #{node := NodeBin}, body := Params0 +}) -> + emqx_utils_api:with_node(NodeBin, fun(Node) -> + Params1 = translate(purge_start, Params0), + wrap_rpc( + Node, + emqx_node_rebalance_api_proto_v2:node_rebalance_purge_start( + Node, Params1 + ) + ) + end). + +'/load_rebalance/:node/purge/stop'(post, #{bindings := #{node := NodeBin}}) -> + emqx_utils_api:with_node(NodeBin, fun(Node) -> + wrap_rpc( + Node, emqx_node_rebalance_api_proto_v2:node_rebalance_purge_stop(Node) ) end). 
@@ -483,6 +546,17 @@ fields(rebalance_evacuation_start) -> } )} ]; +fields(purge_start) -> + [ + {"purge_rate", + mk( + pos_integer(), + #{ + desc => ?DESC(purge_rate), + required => false + } + )} + ]; fields(local_status_disabled) -> [ {"status", @@ -687,6 +761,38 @@ fields(global_evacuation_status) -> } )} ]; +fields(global_purge_status) -> + without( + [ + "status", + "process", + "connection_eviction_rate", + "session_eviction_rate", + "connection_goal", + "disconnected_session_goal", + "session_recipients", + "recipients" + ], + fields(local_status_enabled) + ) ++ + [ + {"purge_rate", + mk( + pos_integer(), + #{ + desc => ?DESC(local_status_purge_rate), + required => false + } + )}, + {"node", + mk( + binary(), + #{ + desc => ?DESC(evacuation_status_node), + required => true + } + )} + ]; fields(global_status) -> [ {"evacuations", @@ -697,6 +803,14 @@ fields(global_status) -> required => false } )}, + {"purges", + mk( + hoconsc:array(ref(global_purge_status)), + #{ + desc => ?DESC(global_status_purges), + required => false + } + )}, {"rebalances", mk( hoconsc:array(ref(global_coordinator_status)), @@ -735,6 +849,9 @@ rebalance_evacuation_example() -> } }. +purge_example() -> + #{purge => #{purge_rate => 100}}. + local_status_response_schema() -> hoconsc:union([ref(local_status_disabled), ref(local_status_enabled)]). 
diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl index 66f7a1789..9e591eb5c 100644 --- a/apps/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl +++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl @@ -29,6 +29,15 @@ cli(["start" | StartArgs]) -> emqx_ctl:print("Rebalance is already enabled~n"), false end; + {purge, Opts} -> + case emqx_node_rebalance_purge:start(Opts) of + ok -> + emqx_ctl:print("Rebalance(purge) started~n"), + true; + {error, Reason} -> + emqx_ctl:print("Rebalance(purge) start error: ~p~n", [Reason]), + false + end; {rebalance, Opts} -> case emqx_node_rebalance:start(Opts) of ok -> @@ -55,6 +64,7 @@ cli(["node-status"]) -> cli(["status"]) -> #{ evacuations := Evacuations, + purges := Purges, rebalances := Rebalances } = emqx_node_rebalance_status:global_status(), lists:foreach( @@ -69,6 +79,18 @@ cli(["status"]) -> end, Evacuations ), + lists:foreach( + fun({Node, Status}) -> + emqx_ctl:print( + "--------------------------------------------------------------------~n" + ), + emqx_ctl:print( + "Node ~p: purge~n~s", + [Node, emqx_node_rebalance_status:format_local_status(Status)] + ) + end, + Purges + ), lists:foreach( fun({Node, Status}) -> emqx_ctl:print( @@ -82,10 +104,14 @@ cli(["status"]) -> Rebalances ); cli(["stop"]) -> - case emqx_node_rebalance_evacuation:status() of - {enabled, _} -> - ok = emqx_node_rebalance_evacuation:stop(), - emqx_ctl:print("Rebalance(evacuation) stopped~n"), + Checks = + [ + {evacuation, fun emqx_node_rebalance_evacuation:status/0, + fun emqx_node_rebalance_evacuation:stop/0}, + {purge, fun emqx_node_rebalance_purge:status/0, fun emqx_node_rebalance_purge:stop/0} + ], + case do_stop(Checks) of + ok -> true; disabled -> case emqx_node_rebalance:status() of @@ -112,6 +138,12 @@ cli(_) -> "Start current node evacuation with optional server redirect to the specified servers" }, + { + "rebalance start --purge \\\n" + " [--purge-rate 
CountPerSec]", + "Start purge on all running nodes in the cluster" + }, + { "rebalance start \\\n" " [--nodes \"node1@host1 node2@host2\"] \\\n" @@ -140,7 +172,11 @@ cli(_) -> node_status(NodeStatus) -> case NodeStatus of - {Process, Status} when Process =:= evacuation orelse Process =:= rebalance -> + {Process, Status} when + Process =:= evacuation; + Process =:= purge; + Process =:= rebalance + -> emqx_ctl:print( "Rebalance type: ~p~n~s~n", [Process, emqx_node_rebalance_status:format_local_status(Status)] @@ -160,6 +196,13 @@ start_args(Args) -> {error, _} = Error -> Error end; + {ok, #{"--purge" := true} = Collected} -> + case validate_purge(maps:to_list(Collected), #{}) of + {ok, Validated} -> + {purge, Validated}; + {error, _} = Error -> + Error + end; {ok, #{} = Collected} -> case validate_rebalance(maps:to_list(Collected), #{}) of {ok, Validated} -> @@ -180,6 +223,11 @@ collect_args(["--redirect-to", ServerReference | Args], Map) -> collect_args(Args, Map#{"--redirect-to" => ServerReference}); collect_args(["--migrate-to", MigrateTo | Args], Map) -> collect_args(Args, Map#{"--migrate-to" => MigrateTo}); +%% purge +collect_args(["--purge" | Args], Map) -> + collect_args(Args, Map#{"--purge" => true}); +collect_args(["--purge-rate", PurgeRate | Args], Map) -> + collect_args(Args, Map#{"--purge-rate" => PurgeRate}); %% rebalance collect_args(["--nodes", Nodes | Args], Map) -> collect_args(Args, Map#{"--nodes" => Nodes}); @@ -239,6 +287,15 @@ validate_evacuation([{"--migrate-to", MigrateTo} | Rest], Map) -> validate_evacuation(Rest, _Map) -> {error, io_lib:format("unknown evacuation arguments: ~p", [Rest])}. +validate_purge([], Map) -> + {ok, Map}; +validate_purge([{"--purge", _} | Rest], Map) -> + validate_purge(Rest, Map); +validate_purge([{"--purge-rate", _} | _] = Opts, Map) -> + validate_pos_int(purge_rate, Opts, Map, fun validate_purge/2); +validate_purge(Rest, _Map) -> + {error, io_lib:format("unknown purge arguments: ~p", [Rest])}. 
+ validate_rebalance([], Map) -> {ok, Map}; validate_rebalance([{"--wait-health-check", _} | _] = Opts, Map) -> @@ -306,3 +363,15 @@ strings_to_atoms([Str | Rest], Atoms, Invalid) -> {error, _} -> strings_to_atoms(Rest, Atoms, [Str | Invalid]) end. + +do_stop([{Type, Check, Stop} | Rest]) -> + case Check() of + {enabled, _} -> + ok = Stop(), + emqx_ctl:print("Rebalance(~s) stopped~n", [Type]), + ok; + disabled -> + do_stop(Rest) + end; +do_stop([]) -> + disabled. diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance_purge.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_purge.erl new file mode 100644 index 000000000..71820266c --- /dev/null +++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance_purge.erl @@ -0,0 +1,233 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_purge). + +-include("emqx_node_rebalance.hrl"). + +-include_lib("emqx/include/logger.hrl"). +-include_lib("emqx/include/types.hrl"). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). + +-export([ + start/1, + status/0, + stop/0 +]). + +-export([start_link/0]). + +-behaviour(gen_statem). + +-export([ + init/1, + callback_mode/0, + handle_event/4, + code_change/4 +]). + +-export_type([ + start_opts/0, + start_error/0, + stop_error/0 +]). + +%%-------------------------------------------------------------------- +%% APIs +%%-------------------------------------------------------------------- + +-define(DEFAULT_PURGE_RATE, 500). +-define(ENABLE_KIND, purge). + +%% gen_statem states +-define(disabled, disabled). +-define(purging, purging). +-define(cleaning_retained_messages, cleaning_retained_messages). + +-type start_opts() :: #{ + purge_rate => pos_integer() +}. +-type start_error() :: already_started. +-type stop_error() :: not_started. 
+-type stats() :: #{ + initial_sessions := non_neg_integer(), + current_sessions := non_neg_integer(), + purge_rate := pos_integer() +}. +-type status() :: {enabled, stats()} | disabled. + +-spec start(start_opts()) -> ok_or_error(start_error()). +start(StartOpts) -> + Opts = maps:merge(default_opts(), StartOpts), + gen_statem:call(?MODULE, {start, Opts}). + +-spec stop() -> ok_or_error(not_started). +stop() -> + gen_statem:call(?MODULE, stop). + +-spec status() -> status(). +status() -> + gen_statem:call(?MODULE, status). + +-spec start_link() -> startlink_ret(). +start_link() -> + gen_statem:start_link({local, ?MODULE}, ?MODULE, [], []). + +%%-------------------------------------------------------------------- +%% gen_statem callbacks +%%-------------------------------------------------------------------- + +callback_mode() -> handle_event_function. + +%% states: disabled, purging, cleaning_retained_messages + +init([]) -> + {ok, disabled, #{}}. + +%% start +handle_event({call, From}, {start, Opts}, ?disabled, #{} = Data) -> + ok = enable_purge(), + ?SLOG(warning, #{ + msg => "cluster_purge_started", + opts => Opts + }), + NewData = init_data(Data, Opts), + {next_state, ?purging, NewData, [ + {state_timeout, 0, purge}, + {reply, From, ok} + ]}; +handle_event({call, From}, {start, _Opts}, _State, #{}) -> + {keep_state_and_data, [{reply, From, {error, already_started}}]}; +%% stop +handle_event({call, From}, stop, ?disabled, #{}) -> + {keep_state_and_data, [{reply, From, {error, not_started}}]}; +handle_event({call, From}, stop, _State, Data) -> + ok = disable_purge(), + ?SLOG(warning, #{msg => "cluster_purge_stopped"}), + {next_state, disabled, deinit(Data), [{reply, From, ok}]}; +%% status +handle_event({call, From}, status, ?disabled, #{}) -> + {keep_state_and_data, [{reply, From, disabled}]}; +handle_event({call, From}, status, State, Data) -> + Stats = maps:with( + [ + initial_sessions, + current_sessions, + purge_rate + ], + Data + ), + {keep_state_and_data, 
[ + {reply, From, {enabled, Stats#{state => State}}} + ]}; +%% session purge +handle_event( + state_timeout, + purge, + ?purging, + #{ + purge_rate := PurgeRate + } = Data +) -> + case emqx_eviction_agent:status() of + {enabled, #{sessions := Sessions}} when Sessions > 0 -> + ok = purge_sessions(PurgeRate), + ?tp(debug, cluster_purge_evict_session, #{purge_rate => PurgeRate}), + ?SLOG( + warning, + #{ + msg => "cluster_purge_evict_sessions", + count => Sessions, + purge_rate => PurgeRate + } + ), + NewData = Data#{current_sessions => Sessions}, + {keep_state, NewData, [{state_timeout, ?EVICT_INTERVAL, purge}]}; + {enabled, #{sessions := 0}} -> + NewData = Data#{current_conns => 0}, + ?SLOG(warning, #{msg => "cluster_purge_evict_sessions_done"}), + {next_state, ?cleaning_retained_messages, NewData, [ + {state_timeout, 0, clean_retained_messages} + ]} + end; +handle_event( + state_timeout, + clean_retained_messages, + ?cleaning_retained_messages, + Data +) -> + ?SLOG(warning, #{msg => "cluster_purge_cleaning_retained_messages"}), + ok = emqx_retainer:clean(), + ?tp(warning, "cluster_purge_done", #{}), + ok = disable_purge(), + ?tp(warning, "cluster_purge_finished_successfully", #{}), + {next_state, ?disabled, deinit(Data)}; +handle_event({call, From}, Msg, State, Data) -> + ?SLOG(warning, #{msg => "unknown_call", call => Msg, state => State, data => Data}), + {keep_state_and_data, [{reply, From, ignored}]}; +handle_event(info, Msg, State, Data) -> + ?SLOG(warning, #{msg => "unknown_info", info => Msg, state => State, data => Data}), + keep_state_and_data; +handle_event(cast, Msg, State, Data) -> + ?SLOG(warning, #{msg => "unknown_cast", cast => Msg, state => State, data => Data}), + keep_state_and_data. + +code_change(_Vsn, State, Data, _Extra) -> + {ok, State, Data}. 
+ +%%-------------------------------------------------------------------- +%% internal funs +%%-------------------------------------------------------------------- + +default_opts() -> + #{ + purge_rate => ?DEFAULT_PURGE_RATE + }. + +init_data(Data0, Opts) -> + Data1 = maps:merge(Data0, Opts), + SessCount = emqx_eviction_agent:session_count(), + Data1#{ + initial_sessions => SessCount, + current_sessions => SessCount + }. + +deinit(Data) -> + Keys = + [initial_sessions, current_sessions | maps:keys(default_opts())], + maps:without(Keys, Data). + +multicall(Nodes, F, A) -> + case apply(emqx_node_rebalance_proto_v2, F, [Nodes | A]) of + {Results, []} -> + case lists:partition(fun is_ok/1, lists:zip(Nodes, Results)) of + {_OkResults, []} -> + ok; + {_, BadResults} -> + %% we crash on errors so that the coordinator death is signalled to + %% the eviction agents in the cluster. + error({bad_nodes, BadResults}) + end; + {_, [_BadNode | _] = BadNodes} -> + error({bad_nodes, BadNodes}) + end. + +is_ok({_Node, {ok, _}}) -> true; +is_ok({_Node, ok}) -> true; +is_ok(_) -> false. + +enable_purge() -> + Nodes = emqx:running_nodes(), + _ = multicall(Nodes, enable_rebalance_agent, [self(), ?ENABLE_KIND]), + ok. + +disable_purge() -> + Nodes = emqx:running_nodes(), + _ = multicall(Nodes, disable_rebalance_agent, [self(), ?ENABLE_KIND]), + ok. + +purge_sessions(PurgeRate) -> + Nodes = emqx:running_nodes(), + _ = multicall(Nodes, purge_sessions, [PurgeRate]), + ok. diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance_status.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_status.erl index a0102c4f4..dbeb4d97f 100644 --- a/apps/emqx_node_rebalance/src/emqx_node_rebalance_status.erl +++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance_status.erl @@ -15,6 +15,7 @@ %% For RPC -export([ evacuation_status/0, + purge_status/0, rebalance_status/0 ]). 
@@ -22,11 +23,13 @@ %% APIs %%-------------------------------------------------------------------- --spec local_status() -> disabled | {evacuation, map()} | {rebalance, map()}. +-spec local_status() -> disabled | {evacuation, map()} | {purge, map()} | {rebalance, map()}. local_status() -> - case emqx_node_rebalance_evacuation:status() of - {enabled, Status} -> - {evacuation, evacuation(Status)}; + Checks = [ + {evacuation, fun emqx_node_rebalance_evacuation:status/0, fun evacuation/1}, + {purge, fun emqx_node_rebalance_purge:status/0, fun purge/1} + ], + case do_local_status(Checks) of disabled -> case emqx_node_rebalance_agent:status() of {enabled, CoordinatorPid} -> @@ -38,28 +41,37 @@ local_status() -> end; disabled -> disabled - end + end; + Res -> + Res end. --spec local_status(node()) -> disabled | {evacuation, map()} | {rebalance, map()}. +-spec local_status(node()) -> disabled | {evacuation, map()} | {purge, map()} | {rebalance, map()}. local_status(Node) -> - emqx_node_rebalance_status_proto_v1:local_status(Node). + emqx_node_rebalance_status_proto_v2:local_status(Node). -spec format_local_status(map()) -> iodata(). format_local_status(Status) -> format_status(Status, local_status_field_format_order()). --spec global_status() -> #{rebalances := [{node(), map()}], evacuations := [{node(), map()}]}. +-spec global_status() -> + #{ + rebalances := [{node(), map()}], + evacuations := [{node(), map()}], + purges := [{node(), map()}] + }. 
global_status() -> Nodes = emqx:running_nodes(), - {RebalanceResults, _} = emqx_node_rebalance_status_proto_v1:rebalance_status(Nodes), + {RebalanceResults, _} = emqx_node_rebalance_status_proto_v2:rebalance_status(Nodes), Rebalances = [ {Node, coordinator_rebalance(Status)} || {Node, {enabled, Status}} <- RebalanceResults ], - {EvacuatioResults, _} = emqx_node_rebalance_status_proto_v1:evacuation_status(Nodes), - Evacuations = [{Node, evacuation(Status)} || {Node, {enabled, Status}} <- EvacuatioResults], - #{rebalances => Rebalances, evacuations => Evacuations}. + {EvacuationResults, _} = emqx_node_rebalance_status_proto_v2:evacuation_status(Nodes), + Evacuations = [{Node, evacuation(Status)} || {Node, {enabled, Status}} <- EvacuationResults], + {PurgeResults, _} = emqx_node_rebalance_status_proto_v2:purge_status(Nodes), + Purges = [{Node, purge(Status)} || {Node, {enabled, Status}} <- PurgeResults], + #{rebalances => Rebalances, evacuations => Evacuations, purges => Purges}. -spec format_coordinator_status(map()) -> iodata(). format_coordinator_status(Status) -> @@ -85,6 +97,17 @@ evacuation(Status) -> } }. +purge(Status) -> + #{ + state => maps:get(state, Status), + purge_rate => maps:get(purge_rate, Status), + session_goal => 0, + stats => #{ + initial_sessions => maps:get(initial_sessions, Status), + current_sessions => maps:get(current_sessions, Status) + } + }. 
+ local_rebalance(#{donors := Donors} = Stats, Node) -> case lists:member(Node, Donors) of true -> {rebalance, donor_rebalance(Stats, Node)}; @@ -159,6 +182,7 @@ local_status_field_format_order() -> coordinator_node, connection_eviction_rate, session_eviction_rate, + purge_rate, connection_goal, session_goal, disconnected_session_goal, @@ -201,6 +225,8 @@ format_local_status_field({connection_eviction_rate, ConnEvictRate}) -> io_lib:format("Connection eviction rate: ~p connections/second~n", [ConnEvictRate]); format_local_status_field({session_eviction_rate, SessEvictRate}) -> io_lib:format("Session eviction rate: ~p sessions/second~n", [SessEvictRate]); +format_local_status_field({purge_rate, PurgeRate}) -> + io_lib:format("Purge rate: ~p sessions/second~n", [PurgeRate]); format_local_status_field({connection_goal, ConnGoal}) -> io_lib:format("Connection goal: ~p~n", [ConnGoal]); format_local_status_field({session_goal, SessGoal}) -> @@ -231,8 +257,21 @@ format_local_stats(Stats) -> ) ]. +do_local_status([{Type, Get, Cont} | Rest]) -> + case Get() of + disabled -> + do_local_status(Rest); + {enabled, Status} -> + {Type, Cont(Status)} + end; +do_local_status([]) -> + disabled. + evacuation_status() -> {node(), emqx_node_rebalance_evacuation:status()}. +purge_status() -> + {node(), emqx_node_rebalance_purge:status()}. + rebalance_status() -> {node(), emqx_node_rebalance:status()}. 
diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance_sup.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_sup.erl index cfaccc4c2..8ec16c1e7 100644 --- a/apps/emqx_node_rebalance/src/emqx_node_rebalance_sup.erl +++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance_sup.erl @@ -15,6 +15,7 @@ start_link() -> init([]) -> Childs = [ + child_spec(emqx_node_rebalance_purge, []), child_spec(emqx_node_rebalance_evacuation, []), child_spec(emqx_node_rebalance_agent, []), child_spec(emqx_node_rebalance, []) diff --git a/apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_api_proto_v2.erl b/apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_api_proto_v2.erl new file mode 100644 index 000000000..2b5b4bca3 --- /dev/null +++ b/apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_api_proto_v2.erl @@ -0,0 +1,59 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_api_proto_v2). + +-behaviour(emqx_bpapi). + +-export([ + introduced_in/0, + + node_rebalance_evacuation_start/2, + node_rebalance_evacuation_stop/1, + + node_rebalance_start/2, + node_rebalance_stop/1, + + %% Introduced in v2: + node_rebalance_purge_start/2, + node_rebalance_purge_stop/1 +]). + +-include_lib("emqx/include/bpapi.hrl"). +-include_lib("emqx/include/types.hrl"). + +introduced_in() -> + "5.2.1". + +-spec node_rebalance_evacuation_start(node(), emqx_node_rebalance_evacuation:start_opts()) -> + emqx_rpc:badrpc() | ok_or_error(emqx_node_rebalance_evacuation:start_error()). +node_rebalance_evacuation_start(Node, #{} = Opts) -> + rpc:call(Node, emqx_node_rebalance_evacuation, start, [Opts]). + +-spec node_rebalance_evacuation_stop(node()) -> + emqx_rpc:badrpc() | ok_or_error(not_started). +node_rebalance_evacuation_stop(Node) -> + rpc:call(Node, emqx_node_rebalance_evacuation, stop, []). 
+ +-spec node_rebalance_start(node(), emqx_node_rebalance:start_opts()) -> + emqx_rpc:badrpc() | ok_or_error(emqx_node_rebalance:start_error()). +node_rebalance_start(Node, Opts) -> + rpc:call(Node, emqx_node_rebalance, start, [Opts]). + +-spec node_rebalance_stop(node()) -> + emqx_rpc:badrpc() | ok_or_error(not_started). +node_rebalance_stop(Node) -> + rpc:call(Node, emqx_node_rebalance, stop, []). + +%% Introduced in v2: + +-spec node_rebalance_purge_start(node(), emqx_node_rebalance_purge:start_opts()) -> + emqx_rpc:badrpc() | ok_or_error(emqx_node_rebalance_purge:start_error()). +node_rebalance_purge_start(Node, #{} = Opts) -> + rpc:call(Node, emqx_node_rebalance_purge, start, [Opts]). + +-spec node_rebalance_purge_stop(node()) -> + emqx_rpc:badrpc() | ok_or_error(emqx_node_rebalance_purge:stop_error()). +node_rebalance_purge_stop(Node) -> + rpc:call(Node, emqx_node_rebalance_purge, stop, []). diff --git a/apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_proto_v2.erl b/apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_proto_v2.erl new file mode 100644 index 000000000..ca8233288 --- /dev/null +++ b/apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_proto_v2.erl @@ -0,0 +1,84 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_proto_v2). + +-behaviour(emqx_bpapi). + +-export([ + introduced_in/0, + + available_nodes/1, + evict_connections/2, + evict_sessions/4, + connection_counts/1, + session_counts/1, + enable_rebalance_agent/2, + disable_rebalance_agent/2, + disconnected_session_counts/1, + + %% Introduced in v2: + enable_rebalance_agent/3, + disable_rebalance_agent/3, + purge_sessions/2 +]). + +-include_lib("emqx/include/bpapi.hrl"). +-include_lib("emqx/include/types.hrl"). + +introduced_in() -> + "5.2.1". 
+ +-spec available_nodes([node()]) -> emqx_rpc:multicall_result(node()). +available_nodes(Nodes) -> + rpc:multicall(Nodes, emqx_node_rebalance, is_node_available, []). + +-spec evict_connections([node()], non_neg_integer()) -> + emqx_rpc:multicall_result(ok_or_error(disabled)). +evict_connections(Nodes, Count) -> + rpc:multicall(Nodes, emqx_eviction_agent, evict_connections, [Count]). + +-spec evict_sessions([node()], non_neg_integer(), [node()], emqx_channel:conn_state()) -> + emqx_rpc:multicall_result(ok_or_error(disabled)). +evict_sessions(Nodes, Count, RecipientNodes, ConnState) -> + rpc:multicall(Nodes, emqx_eviction_agent, evict_sessions, [Count, RecipientNodes, ConnState]). + +-spec connection_counts([node()]) -> emqx_rpc:multicall_result({ok, non_neg_integer()}). +connection_counts(Nodes) -> + rpc:multicall(Nodes, emqx_node_rebalance, connection_count, []). + +-spec session_counts([node()]) -> emqx_rpc:multicall_result({ok, non_neg_integer()}). +session_counts(Nodes) -> + rpc:multicall(Nodes, emqx_node_rebalance, session_count, []). + +-spec enable_rebalance_agent([node()], pid()) -> + emqx_rpc:multicall_result(ok_or_error(already_enabled | eviction_agent_busy)). +enable_rebalance_agent(Nodes, OwnerPid) -> + rpc:multicall(Nodes, emqx_node_rebalance_agent, enable, [OwnerPid]). + +-spec disable_rebalance_agent([node()], pid()) -> + emqx_rpc:multicall_result(ok_or_error(already_disabled | invalid_coordinator)). +disable_rebalance_agent(Nodes, OwnerPid) -> + rpc:multicall(Nodes, emqx_node_rebalance_agent, disable, [OwnerPid]). + +-spec disconnected_session_counts([node()]) -> emqx_rpc:multicall_result({ok, non_neg_integer()}). +disconnected_session_counts(Nodes) -> + rpc:multicall(Nodes, emqx_node_rebalance, disconnected_session_count, []). + +%% Introduced in v2: + +-spec enable_rebalance_agent([node()], pid(), emqx_eviction_agent:kind()) -> + emqx_rpc:multicall_result(ok_or_error(already_enabled | eviction_agent_busy)). 
+enable_rebalance_agent(Nodes, OwnerPid, Kind) -> + rpc:multicall(Nodes, emqx_node_rebalance_agent, enable, [OwnerPid, Kind]). + +-spec disable_rebalance_agent([node()], pid(), emqx_eviction_agent:kind()) -> + emqx_rpc:multicall_result(ok_or_error(already_disabled | invalid_coordinator)). +disable_rebalance_agent(Nodes, OwnerPid, Kind) -> + rpc:multicall(Nodes, emqx_node_rebalance_agent, disable, [OwnerPid, Kind]). + +-spec purge_sessions([node()], non_neg_integer()) -> + emqx_rpc:multicall_result(ok_or_error(disabled)). +purge_sessions(Nodes, Count) -> + rpc:multicall(Nodes, emqx_eviction_agent, purge_sessions, [Count]). diff --git a/apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_purge_proto_v1.erl b/apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_purge_proto_v1.erl new file mode 100644 index 000000000..8b2b63bc4 --- /dev/null +++ b/apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_purge_proto_v1.erl @@ -0,0 +1,29 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_purge_proto_v1). + +-behaviour(emqx_bpapi). + +-export([ + introduced_in/0, + + start/2, + stop/1 +]). + +-include_lib("emqx/include/bpapi.hrl"). + +introduced_in() -> + "5.2.1". + +-spec start([node()], emqx_node_rebalance_purge:start_opts()) -> + emqx_rpc:erpc_multicall(ok | {error, emqx_node_rebalance_purge:start_error()}). +start(Nodes, Opts) -> + erpc:multicall(Nodes, emqx_node_rebalance_purge, start, [Opts]). + +-spec stop([node()]) -> + emqx_rpc:erpc_multicall(ok | {error, emqx_node_rebalance_purge:stop_error()}). +stop(Nodes) -> + erpc:multicall(Nodes, emqx_node_rebalance_purge, stop, []). 
diff --git a/apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_status_proto_v2.erl b/apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_status_proto_v2.erl
new file mode 100644
index 000000000..af8981b59
--- /dev/null
+++ b/apps/emqx_node_rebalance/src/proto/emqx_node_rebalance_status_proto_v2.erl
@@ -0,0 +1,48 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_status_proto_v2).
+
+-behaviour(emqx_bpapi).
+
+-export([
+    introduced_in/0,
+
+    local_status/1,
+    rebalance_status/1,
+    evacuation_status/1,
+
+    %% Introduced in v2:
+    purge_status/1
+]).
+
+-include_lib("emqx/include/bpapi.hrl").
+-include_lib("emqx/include/types.hrl").
+
+introduced_in() ->
+    "5.2.1".
+
+%% The remote `local_status/0` may report an active purge as `{purge, map()}`.
+-spec local_status(node()) ->
+    emqx_rpc:badrpc() | disabled | {evacuation, map()} | {purge, map()} | {rebalance, map()}.
+local_status(Node) ->
+    rpc:call(Node, emqx_node_rebalance_status, local_status, []).
+
+-spec rebalance_status([node()]) ->
+    emqx_rpc:multicall_result({node(), map()}).
+rebalance_status(Nodes) ->
+    rpc:multicall(Nodes, emqx_node_rebalance_status, rebalance_status, []).
+
+-spec evacuation_status([node()]) ->
+    emqx_rpc:multicall_result({node(), map()}).
+evacuation_status(Nodes) ->
+    rpc:multicall(Nodes, emqx_node_rebalance_status, evacuation_status, []).
+
+%% Introduced in v2:
+
+%% Calls `emqx_node_rebalance_status:purge_status/0` on each of `Nodes`.
+-spec purge_status([node()]) ->
+    emqx_rpc:multicall_result({node(), map()}).
+purge_status(Nodes) ->
+    rpc:multicall(Nodes, emqx_node_rebalance_status, purge_status, []).
diff --git a/apps/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl index 188e6bf71..37227c15e 100644 --- a/apps/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl +++ b/apps/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl @@ -38,32 +38,35 @@ end_per_suite(_Config) -> ok. init_per_testcase(Case, Config) -> - [{DonorNode, _} | _] = - ClusterNodes = emqx_eviction_agent_test_helpers:start_cluster( - [ - {case_specific_node_name(?MODULE, Case, '_donor'), 2883}, - {case_specific_node_name(?MODULE, Case, '_recipient'), 3883} - ], - ?START_APPS, - [{emqx, data_dir, case_specific_data_dir(Case, Config)}] + DonorNode = case_specific_node_name(?MODULE, Case, '_donor'), + RecipientNode = case_specific_node_name(?MODULE, Case, '_recipient'), + Spec = #{ + role => core, + join_to => emqx_cth_cluster:node_name(DonorNode), + listeners => true, + apps => app_specs() + }, + Cluster = [{Node, Spec} || Node <- [DonorNode, RecipientNode]], + ClusterNodes = + [Node1 | _] = emqx_cth_cluster:start( + Cluster, + #{work_dir => ?config(priv_dir, Config)} ), - - ok = rpc:call(DonorNode, emqx_mgmt_api_test_util, init_suite, []), - ok = take_auth_header_from(DonorNode), - + ok = rpc:call(Node1, emqx_mgmt_api_test_util, init_suite, []), + ok = take_auth_header_from(Node1), [{cluster_nodes, ClusterNodes} | Config]. end_per_testcase(_Case, Config) -> - _ = emqx_eviction_agent_test_helpers:stop_cluster( - ?config(cluster_nodes, Config), - ?START_APPS - ). + Nodes = ?config(cluster_nodes, Config), + erpc:multicall(Nodes, meck, unload, []), + _ = emqx_cth_cluster:stop(Nodes), + ok. 
%%-------------------------------------------------------------------- %% Tests %%-------------------------------------------------------------------- t_start_evacuation_validation(Config) -> - [{DonorNode, _}, {RecipientNode, _}] = ?config(cluster_nodes, Config), + [DonorNode, RecipientNode] = ?config(cluster_nodes, Config), BadOpts = [ #{conn_evict_rate => <<"conn">>}, #{sess_evict_rate => <<"sess">>}, @@ -117,10 +120,86 @@ t_start_evacuation_validation(Config) -> api_get(["load_rebalance", "global_status"]) ). +t_start_purge_validation(Config) -> + [Node1 | _] = ?config(cluster_nodes, Config), + Port1 = get_mqtt_port(Node1, tcp), + BadOpts = [ + #{purge_rate => <<"conn">>}, + #{purge_rate => 0}, + #{purge_rate => -1}, + #{purge_rate => 1.1}, + #{unknown => <<"Value">>} + ], + lists:foreach( + fun(Opts) -> + ?assertMatch( + {ok, 400, #{}}, + api_post( + ["load_rebalance", atom_to_list(Node1), "purge", "start"], + Opts + ), + Opts + ) + end, + BadOpts + ), + ?assertMatch( + {ok, 404, #{}}, + api_post( + ["load_rebalance", "bad@node", "purge", "start"], + #{} + ) + ), + + process_flag(trap_exit, true), + Conns = emqtt_connect_many(Port1, 100), + + ?assertMatch( + {ok, 200, #{}}, + api_post( + ["load_rebalance", atom_to_list(Node1), "purge", "start"], + #{purge_rate => 10} + ) + ), + + Node1Bin = atom_to_binary(Node1), + ?assertMatch( + {ok, 200, #{<<"purges">> := [#{<<"node">> := Node1Bin}]}}, + api_get(["load_rebalance", "global_status"]) + ), + + ?assertMatch( + {ok, 200, #{ + <<"process">> := <<"purge">>, + <<"purge_rate">> := 10, + <<"session_goal">> := 0, + <<"state">> := <<"purging">>, + <<"stats">> := + #{ + <<"current_sessions">> := _, + <<"initial_sessions">> := 100 + } + }}, + api_get(["load_rebalance", "status"]) + ), + + ?assertMatch( + {ok, 200, #{}}, + api_post( + ["load_rebalance", atom_to_list(Node1), "purge", "stop"], + #{} + ) + ), + + ok = stop_many(Conns), + + ok. 
+ t_start_rebalance_validation(Config) -> process_flag(trap_exit, true), - [{DonorNode, DonorPort}, {RecipientNode, _}] = ?config(cluster_nodes, Config), + [DonorNode, RecipientNode] = ?config(cluster_nodes, Config), + DonorPort = get_mqtt_port(DonorNode, tcp), BadOpts = [ #{conn_evict_rate => <<"conn">>}, @@ -189,7 +268,7 @@ t_start_rebalance_validation(Config) -> ok = stop_many(Conns). t_start_stop_evacuation(Config) -> - [{DonorNode, _}, {RecipientNode, _}] = ?config(cluster_nodes, Config), + [DonorNode, RecipientNode] = ?config(cluster_nodes, Config), StartOpts = maps:merge( maps:get(evacuation, emqx_node_rebalance_api:rebalance_evacuation_example()), @@ -284,7 +363,8 @@ t_start_stop_evacuation(Config) -> t_start_stop_rebalance(Config) -> process_flag(trap_exit, true), - [{DonorNode, DonorPort}, {RecipientNode, _}] = ?config(cluster_nodes, Config), + [DonorNode, RecipientNode] = ?config(cluster_nodes, Config), + DonorPort = get_mqtt_port(DonorNode, tcp), ?assertMatch( {ok, 200, #{<<"status">> := <<"disabled">>}}, @@ -390,7 +470,7 @@ t_start_stop_rebalance(Config) -> ok = stop_many(Conns). t_availability_check(Config) -> - [{DonorNode, _} | _] = ?config(cluster_nodes, Config), + [DonorNode | _] = ?config(cluster_nodes, Config), ?assertMatch( {ok, 200, #{}}, api_get(["load_rebalance", "availability_check"]) @@ -425,7 +505,12 @@ api_get(Path) -> api_post(Path, Data) -> case request(post, uri(Path), Data) of {ok, Code, ResponseBody} -> - {ok, Code, jiffy:decode(ResponseBody, [return_maps])}; + Res = + case emqx_utils_json:safe_decode(ResponseBody, [return_maps]) of + {ok, Decoded} -> Decoded; + {error, _} -> ResponseBody + end, + {ok, Code, Res}; {error, _} = Error -> Error end. @@ -444,3 +529,26 @@ case_specific_data_dir(Case, Config) -> undefined -> undefined; PrivDir -> filename:join(PrivDir, atom_to_list(Case)) end. 
+ +app_specs() -> + [ + {emqx, #{ + before_start => fun() -> + emqx_app:set_config_loader(?MODULE) + end, + override_env => [{boot_modules, [broker, listeners]}] + }}, + {emqx_retainer, #{ + config => + #{ + retainer => + #{enable => true} + } + }}, + emqx_eviction_agent, + emqx_node_rebalance + ]. + +get_mqtt_port(Node, Type) -> + {_IP, Port} = erpc:call(Node, emqx_config, get, [[listeners, Type, default, bind]]), + Port. diff --git a/apps/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl index 54ecad026..363254298 100644 --- a/apps/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl +++ b/apps/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl @@ -156,6 +156,80 @@ t_evacuation(_Config) -> emqx_node_rebalance_evacuation:status() ). +t_purge(_Config) -> + %% start with invalid args + ?assertNot( + emqx_node_rebalance_cli:cli(["start", "--purge", "--foo-bar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli:cli(["start", "--purge", "--purge-rate", "foobar"]) + ), + + %% not used by this scenario + ?assertNot( + emqx_node_rebalance_cli:cli(["start", "--purge", "--conn-evict-rate", "1"]) + ), + + ?assertNot( + emqx_node_rebalance_cli:cli(["start", "--purge", "--sess-evict-rate", "1"]) + ), + + ?assertNot( + emqx_node_rebalance_cli:cli(["start", "--purge", "--wait-takeover", "1"]) + ), + + ?assertNot( + emqx_node_rebalance_cli:cli([ + "start", + "--purge", + "--migrate-to", + atom_to_list(node()) + ]) + ), + with_some_sessions(fun() -> + ?assert( + emqx_node_rebalance_cli:cli([ + "start", + "--purge", + "--purge-rate", + "10" + ]) + ), + + %% status + ok = emqx_node_rebalance_cli:cli(["status"]), + ok = emqx_node_rebalance_cli:cli(["node-status"]), + ok = emqx_node_rebalance_cli:cli(["node-status", atom_to_list(node())]), + + ?assertMatch( + {enabled, #{}}, + emqx_node_rebalance_purge:status() + ), + + %% already enabled + ?assertNot( + emqx_node_rebalance_cli:cli([ + "start", + 
"--purge", + "--purge-rate", + "10" + ]) + ), + true = emqx_node_rebalance_cli:cli(["stop"]), + ok + end), + %% stop + + false = emqx_node_rebalance_cli:cli(["stop"]), + + ?assertEqual( + disabled, + emqx_node_rebalance_purge:status() + ), + + ok. + t_rebalance(Config) -> process_flag(trap_exit, true), @@ -289,3 +363,19 @@ emqx_node_rebalance_cli(Node, Args) -> Result -> Result end. + +%% to avoid it finishing too fast +with_some_sessions(Fn) -> + emqx_common_test_helpers:with_mock( + emqx_eviction_agent, + status, + fun() -> + case meck:passthrough([]) of + {enabled, Status = #{sessions := _}} -> + {enabled, Status#{sessions := 100}}; + Res -> + Res + end + end, + Fn + ). diff --git a/apps/emqx_node_rebalance/test/emqx_node_rebalance_purge_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_purge_SUITE.erl new file mode 100644 index 000000000..2dd3ee93e --- /dev/null +++ b/apps/emqx_node_rebalance/test/emqx_node_rebalance_purge_SUITE.erl @@ -0,0 +1,353 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_purge_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("emqx/include/emqx_mqtt.hrl"). +-include_lib("emqx/include/asserts.hrl"). +-include_lib("eunit/include/eunit.hrl"). +-include_lib("common_test/include/ct.hrl"). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). + +-import( + emqx_eviction_agent_test_helpers, + [ + emqtt_connect/1, + emqtt_try_connect/1, + case_specific_node_name/3 + ] +). + +all() -> + [{group, one_node}, {group, two_nodes}]. + +groups() -> + [ + {one_node, [], one_node_cases()}, + {two_nodes, [], two_nodes_cases()} + ]. + +two_nodes_cases() -> + [ + t_already_started_two, + t_session_purged + ]. + +one_node_cases() -> + emqx_common_test_helpers:all(?MODULE) -- two_nodes_cases(). 
+ +init_per_suite(Config) -> + ok = emqx_common_test_helpers:start_apps([]), + Config. + +end_per_suite(_Config) -> + ok = emqx_common_test_helpers:stop_apps([]), + ok. + +init_per_group(one_node, Config) -> + [{cluster_type, one_node} | Config]; +init_per_group(two_nodes, Config) -> + [{cluster_type, two_nodes} | Config]. + +end_per_group(_Group, _Config) -> + ok. + +init_per_testcase(TestCase, Config) -> + ct:timetrap({seconds, 30}), + Nodes = + [Node1 | _] = + case ?config(cluster_type, Config) of + one_node -> + [case_specific_node_name(?MODULE, TestCase, '_1')]; + two_nodes -> + [ + case_specific_node_name(?MODULE, TestCase, '_1'), + case_specific_node_name(?MODULE, TestCase, '_2') + ] + end, + Spec = #{ + role => core, + join_to => emqx_cth_cluster:node_name(Node1), + listeners => true, + apps => app_specs() + }, + Cluster = [{Node, Spec} || Node <- Nodes], + ClusterNodes = emqx_cth_cluster:start( + Cluster, + #{work_dir => ?config(priv_dir, Config)} + ), + ok = snabbkaffe:start_trace(), + [{cluster_nodes, ClusterNodes} | Config]. + +end_per_testcase(_TestCase, Config) -> + Nodes = ?config(cluster_nodes, Config), + ok = snabbkaffe:stop(), + erpc:multicall(Nodes, meck, unload, []), + ok = emqx_cth_cluster:stop(Nodes), + ok. + +%%-------------------------------------------------------------------- +%% Helpers +%%-------------------------------------------------------------------- + +app_specs() -> + [ + {emqx, #{ + before_start => fun() -> + emqx_app:set_config_loader(?MODULE) + end, + override_env => [{boot_modules, [broker, listeners]}] + }}, + {emqx_retainer, #{ + config => + #{ + retainer => + #{enable => true} + } + }}, + emqx_eviction_agent, + emqx_node_rebalance + ]. + +opts(_Config) -> + #{ + purge_rate => 10 + }. + +case_specific_data_dir(Case, Config) -> + case ?config(priv_dir, Config) of + undefined -> undefined; + PrivDir -> filename:join(PrivDir, atom_to_list(Case)) + end. 
+ +get_mqtt_port(Node, Type) -> + {_IP, Port} = erpc:call(Node, emqx_config, get, [[listeners, Type, default, bind]]), + Port. + +%% to avoid it finishing too fast +with_some_sessions(Node, Fn) -> + erpc:call(Node, fun() -> + emqx_common_test_helpers:with_mock( + emqx_eviction_agent, + status, + fun() -> + case meck:passthrough([]) of + {enabled, Status = #{sessions := _}} -> + {enabled, Status#{sessions := 100}}; + Res -> + Res + end + end, + Fn + ) + end). + +drain_exits([ClientPid | Rest]) -> + receive + {'EXIT', ClientPid, _Reason} -> + drain_exits(Rest) + after 1_000 -> + ct:pal("mailbox:\n ~p", [process_info(self(), messages)]), + ct:fail("pid ~p didn't die", [ClientPid]) + end; +drain_exits([]) -> + ok. + +emqtt_connect_many(Port, Count) -> + emqtt_connect_many(Port, Count, _StartN = 1). + +%% start many clients with mixed clean_start flags +emqtt_connect_many(Port, Count, StartN) -> + lists:map( + fun(N) -> + NBin = integer_to_binary(N), + ClientId = <<"client-", NBin/binary>>, + CleanStart = N rem 2 == 0, + {ok, C} = emqtt_connect([{clientid, ClientId}, {clean_start, CleanStart}, {port, Port}]), + C + end, + lists:seq(StartN, StartN + Count - 1) + ). + +%%-------------------------------------------------------------------- +%% Test Cases : one node +%%-------------------------------------------------------------------- + +t_agent_busy(Config) -> + [Node] = ?config(cluster_nodes, Config), + + ok = rpc:call(Node, emqx_eviction_agent, enable, [other_rebalance, undefined]), + + erpc:call(Node, fun() -> + ?assertExit( + {{{bad_nodes, [{Node, {error, eviction_agent_busy}}]}, _}, _}, + emqx_node_rebalance_purge:start(opts(Config)) + ) + end), + + ok. 
+ +t_already_started(Config) -> + [Node] = ?config(cluster_nodes, Config), + with_some_sessions(Node, fun() -> + ok = emqx_node_rebalance_purge:start(opts(Config)), + + ?assertEqual( + {error, already_started}, + emqx_node_rebalance_purge:start(opts(Config)) + ), + + ?assertEqual( + ok, + emqx_node_rebalance_purge:stop() + ), + + ok + end), + ok. + +t_not_started(Config) -> + [Node] = ?config(cluster_nodes, Config), + + ?assertEqual( + {error, not_started}, + rpc:call(Node, emqx_node_rebalance_purge, stop, []) + ). + +t_start(Config) -> + [Node] = ?config(cluster_nodes, Config), + Port = get_mqtt_port(Node, tcp), + + with_some_sessions(Node, fun() -> + process_flag(trap_exit, true), + ok = snabbkaffe:start_trace(), + + ?assertEqual( + ok, + emqx_node_rebalance_purge:start(opts(Config)) + ), + ?assertEqual({error, {use_another_server, #{}}}, emqtt_try_connect([{port, Port}])), + ok + end), + ok. + +t_non_persistence(Config) -> + [Node] = ?config(cluster_nodes, Config), + Port = get_mqtt_port(Node, tcp), + + %% to avoid it finishing too fast + with_some_sessions(Node, fun() -> + process_flag(trap_exit, true), + ok = snabbkaffe:start_trace(), + + ?assertEqual( + ok, + emqx_node_rebalance_purge:start(opts(Config)) + ), + + ?assertMatch( + {error, {use_another_server, #{}}}, + emqtt_try_connect([{port, Port}]) + ), + + ok = supervisor:terminate_child(emqx_node_rebalance_sup, emqx_node_rebalance_purge), + {ok, _} = supervisor:restart_child(emqx_node_rebalance_sup, emqx_node_rebalance_purge), + + ?assertMatch( + ok, + emqtt_try_connect([{port, Port}]) + ), + ?assertMatch(disabled, emqx_node_rebalance_purge:status()), + ok + end), + ok. + +t_unknown_messages(Config) -> + process_flag(trap_exit, true), + + [Node] = ?config(cluster_nodes, Config), + + ok = rpc:call(Node, emqx_node_rebalance_purge, start, [opts(Config)]), + Pid = rpc:call(Node, erlang, whereis, [emqx_node_rebalance_purge]), + Pid ! 
unknown, + ok = gen_server:cast(Pid, unknown), + ?assertEqual( + ignored, + gen_server:call(Pid, unknown) + ), + + ok. + +%%-------------------------------------------------------------------- +%% Test Cases : two nodes +%%-------------------------------------------------------------------- + +t_already_started_two(Config) -> + [Node1, _Node2] = ?config(cluster_nodes, Config), + with_some_sessions(Node1, fun() -> + ok = emqx_node_rebalance_purge:start(opts(Config)), + + ?assertEqual( + {error, already_started}, + emqx_node_rebalance_purge:start(opts(Config)) + ), + + ?assertEqual( + ok, + emqx_node_rebalance_purge:stop() + ), + + ok + end), + ?assertEqual( + {error, not_started}, + rpc:call(Node1, emqx_node_rebalance_purge, stop, []) + ), + + ok. + +t_session_purged(Config) -> + process_flag(trap_exit, true), + + [Node1, Node2] = ?config(cluster_nodes, Config), + Port1 = get_mqtt_port(Node1, tcp), + Port2 = get_mqtt_port(Node2, tcp), + + Node1Clients = emqtt_connect_many(Port1, 20, _StartN1 = 1), + Node2Clients = emqtt_connect_many(Port2, 20, _StartN2 = 21), + lists:foreach( + fun(C) -> + ClientId = proplists:get_value(clientid, emqtt:info(C)), + Topic = emqx_topic:join([<<"t">>, ClientId]), + Props = #{}, + Payload = ClientId, + Opts = [{retain, true}], + ok = emqtt:publish(C, Topic, Props, Payload, Opts), + {ok, _, [?RC_GRANTED_QOS_0]} = emqtt:subscribe(C, Topic), + ok + end, + Node1Clients ++ Node2Clients + ), + + ?assertEqual(40, erpc:call(Node2, emqx_retainer, retained_count, [])), + + {ok, SRef0} = snabbkaffe:subscribe( + ?match_event(#{?snk_kind := "cluster_purge_done"}), + 15_000 + ), + %% ok = rpc:call(Node1, emqx_node_rebalance_purge, start_global, [Nodes, opts(Config)]), + ok = rpc:call(Node1, emqx_node_rebalance_purge, start, [opts(Config)]), + {ok, _} = snabbkaffe:receive_events(SRef0), + + ?assertEqual([], erpc:call(Node1, emqx_cm, all_channels, [])), + ?assertEqual([], erpc:call(Node2, emqx_cm, all_channels, [])), + ?assertEqual(0, erpc:call(Node1, 
emqx_retainer, retained_count, [])), + ?assertEqual(0, erpc:call(Node2, emqx_retainer, retained_count, [])), + + ok = drain_exits(Node1Clients ++ Node2Clients), + + ok. diff --git a/apps/emqx_node_rebalance/test/emqx_node_rebalance_status_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_status_SUITE.erl index 167c37d8c..f9c50b761 100644 --- a/apps/emqx_node_rebalance/test/emqx_node_rebalance_status_SUITE.erl +++ b/apps/emqx_node_rebalance/test/emqx_node_rebalance_status_SUITE.erl @@ -57,7 +57,7 @@ end_per_suite(Config) -> t_cluster_status(Config) -> [CoreNode, ReplicantNode] = ?config(cluster_nodes, Config), - ok = emqx_node_rebalance_api_proto_v1:node_rebalance_evacuation_start(CoreNode, #{}), + ok = emqx_node_rebalance_api_proto_v2:node_rebalance_evacuation_start(CoreNode, #{}), ?assertMatch( #{evacuations := [_], rebalances := []}, diff --git a/changes/ee/feat-11447.en.md b/changes/ee/feat-11447.en.md new file mode 100644 index 000000000..caa808861 --- /dev/null +++ b/changes/ee/feat-11447.en.md @@ -0,0 +1 @@ +Added CLI command to wipe session and retained message data on the whole cluster. 
diff --git a/rel/i18n/emqx_node_rebalance_api.hocon b/rel/i18n/emqx_node_rebalance_api.hocon index 8b598134a..00346d52c 100644 --- a/rel/i18n/emqx_node_rebalance_api.hocon +++ b/rel/i18n/emqx_node_rebalance_api.hocon @@ -42,6 +42,18 @@ load_rebalance_evacuation_stop.desc: load_rebalance_evacuation_stop.label: """Stop evacuation""" +cluster_purge_start.desc: +"""Start purge process""" + +cluster_purge_start.label: +"""Start purge""" + +cluster_purge_stop.desc: +"""Stop purge process""" + +cluster_purge_stop.label: +"""Stop purge""" + param_node.desc: """Node name""" @@ -150,6 +162,12 @@ local_status_session_eviction_rate.desc: local_status_session_eviction_rate.label: """Session eviction rate""" +local_status_purge_rate.desc: +"""The rate of purging sessions, in sessions per second""" + +local_status_purge_rate.label: +"""Session purge rate""" + local_status_connection_goal.desc: """The number of connections that the node should have after the rebalance/evacuation process""" From f988de4ff4f3a7fade88cd713d08dfeaa19f8365 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Tue, 22 Aug 2023 10:54:44 -0300 Subject: [PATCH 47/85] feat(purge): clear delayed messages --- apps/emqx/priv/bpapi.versions | 2 + .../src/emqx_eviction_agent.erl | 33 ++++++++++++- .../proto/emqx_eviction_agent_proto_v2.erl | 35 ++++++++++++++ apps/emqx_modules/src/emqx_delayed.erl | 28 ++++++++--- .../src/proto/emqx_delayed_proto_v2.erl | 48 +++++++++++++++++++ apps/emqx_modules/test/emqx_delayed_SUITE.erl | 6 +-- .../src/emqx_node_rebalance_purge.erl | 22 ++++----- .../test/emqx_node_rebalance_cli_SUITE.erl | 11 +---- .../test/emqx_node_rebalance_purge_SUITE.erl | 31 +++++++----- 9 files changed, 174 insertions(+), 42 deletions(-) create mode 100644 apps/emqx_eviction_agent/src/proto/emqx_eviction_agent_proto_v2.erl create mode 100644 apps/emqx_modules/src/proto/emqx_delayed_proto_v2.erl diff --git a/apps/emqx/priv/bpapi.versions b/apps/emqx/priv/bpapi.versions index cb9c3ad82..876fe66e0 
100644 --- a/apps/emqx/priv/bpapi.versions +++ b/apps/emqx/priv/bpapi.versions @@ -15,7 +15,9 @@ {emqx_conf,3}. {emqx_dashboard,1}. {emqx_delayed,1}. +{emqx_delayed,2}. {emqx_eviction_agent,1}. +{emqx_eviction_agent,2}. {emqx_exhook,1}. {emqx_ft_storage_exporter_fs,1}. {emqx_ft_storage_fs,1}. diff --git a/apps/emqx_eviction_agent/src/emqx_eviction_agent.erl b/apps/emqx_eviction_agent/src/emqx_eviction_agent.erl index 7376c8069..ab2b9e66a 100644 --- a/apps/emqx_eviction_agent/src/emqx_eviction_agent.erl +++ b/apps/emqx_eviction_agent/src/emqx_eviction_agent.erl @@ -18,6 +18,7 @@ disable/1, status/0, connection_count/0, + all_channels_count/0, session_count/0, session_count/1, evict_connections/1, @@ -27,6 +28,9 @@ evict_session_channel/3 ]). +%% RPC targets +-export([all_local_channels_count/0]). + -behaviour(gen_server). -export([ @@ -240,6 +244,33 @@ channel_with_session_table(RequiredConnState) -> RequiredConnState =:= ConnState ]). +-spec all_channels_count() -> non_neg_integer(). +all_channels_count() -> + Nodes = emqx:running_nodes(), + Timeout = 15_000, + Results = emqx_eviction_agent_proto_v2:all_channels_count(Nodes, Timeout), + NodeResults = lists:zip(Nodes, Results), + Errors = lists:filter( + fun + ({_Node, {ok, _}}) -> false; + ({_Node, _Err}) -> true + end, + NodeResults + ), + Errors =/= [] andalso + ?SLOG( + warning, + #{ + msg => "error_collecting_all_channels_count", + errors => maps:from_list(Errors) + } + ), + lists:sum([N || {ok, N} <- Results]). + +-spec all_local_channels_count() -> non_neg_integer(). +all_local_channels_count() -> + table_count(emqx_cm:all_channels_table(?CONN_MODULES)). + session_count() -> session_count(any). 
@@ -303,7 +334,7 @@ evict_session_channel(Nodes, ClientId, ConnInfo, ClientInfo) -> client_info => ClientInfo } ), - case emqx_eviction_agent_proto_v1:evict_session_channel(Node, ClientId, ConnInfo, ClientInfo) of + case emqx_eviction_agent_proto_v2:evict_session_channel(Node, ClientId, ConnInfo, ClientInfo) of {badrpc, Reason} -> ?SLOG( error, diff --git a/apps/emqx_eviction_agent/src/proto/emqx_eviction_agent_proto_v2.erl b/apps/emqx_eviction_agent/src/proto/emqx_eviction_agent_proto_v2.erl new file mode 100644 index 000000000..2d204079c --- /dev/null +++ b/apps/emqx_eviction_agent/src/proto/emqx_eviction_agent_proto_v2.erl @@ -0,0 +1,35 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_eviction_agent_proto_v2). + +-behaviour(emqx_bpapi). + +-export([ + introduced_in/0, + + evict_session_channel/4, + + %% Introduced in v2: + all_channels_count/2 +]). + +-include_lib("emqx/include/bpapi.hrl"). + +introduced_in() -> + "5.2.1". + +-spec evict_session_channel( + node(), + emqx_types:clientid(), + emqx_types:conninfo(), + emqx_types:clientinfo() +) -> supervisor:startchild_err() | emqx_rpc:badrpc(). +evict_session_channel(Node, ClientId, ConnInfo, ClientInfo) -> + rpc:call(Node, emqx_eviction_agent, evict_session_channel, [ClientId, ConnInfo, ClientInfo]). + +%% Introduced in v2: +-spec all_channels_count([node()], time:time()) -> emqx_rpc:erpc_multicall(non_neg_integer()). +all_channels_count(Nodes, Timeout) -> + erpc:multicall(Nodes, emqx_eviction_agent, all_local_channels_count, [], Timeout). diff --git a/apps/emqx_modules/src/emqx_delayed.erl b/apps/emqx_modules/src/emqx_delayed.erl index 559648bdd..22d18c180 100644 --- a/apps/emqx_modules/src/emqx_delayed.erl +++ b/apps/emqx_modules/src/emqx_delayed.erl @@ -45,18 +45,22 @@ code_change/3 ]). 
-%% gen_server callbacks +%% API -export([ load/0, unload/0, load_or_unload/1, get_conf/1, update_config/1, + delayed_count/0, list/1, get_delayed_message/1, get_delayed_message/2, delete_delayed_message/1, delete_delayed_message/2, + clear_all/0, + %% rpc target + clear_all_local/0, cluster_list/1 ]). @@ -167,6 +171,9 @@ unload() -> load_or_unload(Bool) -> gen_server:call(?SERVER, {do_load_or_unload, Bool}). +-spec delayed_count() -> non_neg_integer(). +delayed_count() -> mnesia:table_info(?TAB, size). + list(Params) -> emqx_mgmt_api:paginate(?TAB, Params, ?FORMAT_FUN). @@ -243,7 +250,7 @@ get_delayed_message(Id) -> get_delayed_message(Node, Id) when Node =:= node() -> get_delayed_message(Id); get_delayed_message(Node, Id) -> - emqx_delayed_proto_v1:get_delayed_message(Node, Id). + emqx_delayed_proto_v2:get_delayed_message(Node, Id). -spec delete_delayed_message(binary()) -> with_id_return(). delete_delayed_message(Id) -> @@ -258,7 +265,19 @@ delete_delayed_message(Id) -> delete_delayed_message(Node, Id) when Node =:= node() -> delete_delayed_message(Id); delete_delayed_message(Node, Id) -> - emqx_delayed_proto_v1:delete_delayed_message(Node, Id). + emqx_delayed_proto_v2:delete_delayed_message(Node, Id). + +-spec clear_all() -> ok. +clear_all() -> + Nodes = emqx:running_nodes(), + _ = emqx_delayed_proto_v2:clear_all(Nodes), + ok. + +%% rpc target +-spec clear_all_local() -> ok. +clear_all_local() -> + _ = mria:clear_table(?TAB), + ok. update_config(Config) -> emqx_conf:update([delayed], Config, #{rawconf_with_defaults => true, override_to => cluster}). @@ -408,9 +427,6 @@ do_publish(Key = {Ts, _Id}, Now, Acc) when Ts =< Now -> end, do_publish(mnesia:dirty_next(?TAB, Key), Now, [Key | Acc]). --spec delayed_count() -> non_neg_integer(). -delayed_count() -> mnesia:table_info(?TAB, size). 
- do_load_or_unload(true, State) -> emqx_hooks:put('message.publish', {?MODULE, on_message_publish, []}, ?HP_DELAY_PUB), State; diff --git a/apps/emqx_modules/src/proto/emqx_delayed_proto_v2.erl b/apps/emqx_modules/src/proto/emqx_delayed_proto_v2.erl new file mode 100644 index 000000000..6d96dcc66 --- /dev/null +++ b/apps/emqx_modules/src/proto/emqx_delayed_proto_v2.erl @@ -0,0 +1,48 @@ +%%-------------------------------------------------------------------- +%%Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_delayed_proto_v2). + +-behaviour(emqx_bpapi). + +-export([ + introduced_in/0, + get_delayed_message/2, + delete_delayed_message/2, + + %% Introduced in v2: + clear_all/1 +]). + +-include_lib("emqx/include/bpapi.hrl"). + +introduced_in() -> + "5.2.1". + +-spec get_delayed_message(node(), binary()) -> + emqx_delayed:with_id_return(map()) | emqx_rpc:badrpc(). +get_delayed_message(Node, Id) -> + rpc:call(Node, emqx_delayed, get_delayed_message, [Id]). + +-spec delete_delayed_message(node(), binary()) -> emqx_delayed:with_id_return() | emqx_rpc:badrpc(). +delete_delayed_message(Node, Id) -> + rpc:call(Node, emqx_delayed, delete_delayed_message, [Id]). + +%% Introduced in v2: + +-spec clear_all([node()]) -> emqx_rpc:erpc_multicall(ok). 
+clear_all(Nodes) -> + erpc:multicall(Nodes, emqx_delayed, clear_all_local, []). diff --git a/apps/emqx_modules/test/emqx_delayed_SUITE.erl b/apps/emqx_modules/test/emqx_delayed_SUITE.erl index 56eff8bfc..5085aa2da 100644 --- a/apps/emqx_modules/test/emqx_delayed_SUITE.erl +++ b/apps/emqx_modules/test/emqx_delayed_SUITE.erl @@ -164,15 +164,15 @@ t_cluster(_) -> ?assertMatch( {ok, _}, - emqx_delayed_proto_v1:get_delayed_message(node(), Id) + emqx_delayed_proto_v2:get_delayed_message(node(), Id) ), %% The 'local' and the 'fake-remote' values should be the same, %% however there is a race condition, so we are just assert that they are both 'ok' tuples ?assertMatch({ok, _}, emqx_delayed:get_delayed_message(Id)), - ?assertMatch({ok, _}, emqx_delayed_proto_v1:get_delayed_message(node(), Id)), + ?assertMatch({ok, _}, emqx_delayed_proto_v2:get_delayed_message(node(), Id)), - ok = emqx_delayed_proto_v1:delete_delayed_message(node(), Id), + ok = emqx_delayed_proto_v2:delete_delayed_message(node(), Id), ?assertMatch( {error, _}, diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance_purge.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_purge.erl index 71820266c..81f1bfe03 100644 --- a/apps/emqx_node_rebalance/src/emqx_node_rebalance_purge.erl +++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance_purge.erl @@ -43,7 +43,7 @@ %% gen_statem states -define(disabled, disabled). -define(purging, purging). --define(cleaning_retained_messages, cleaning_retained_messages). +-define(cleaning_data, cleaning_data). -type start_opts() :: #{ purge_rate => pos_integer() @@ -80,7 +80,7 @@ start_link() -> callback_mode() -> handle_event_function. -%% states: disabled, purging, cleaning_retained_messages +%% states: disabled, purging, cleaning_data init([]) -> {ok, disabled, #{}}. 
@@ -130,35 +130,35 @@ handle_event( purge_rate := PurgeRate } = Data ) -> - case emqx_eviction_agent:status() of - {enabled, #{sessions := Sessions}} when Sessions > 0 -> + case emqx_eviction_agent:all_channels_count() of + Sessions when Sessions > 0 -> ok = purge_sessions(PurgeRate), - ?tp(debug, cluster_purge_evict_session, #{purge_rate => PurgeRate}), - ?SLOG( + ?tp( warning, + "cluster_purge_evict_sessions", #{ - msg => "cluster_purge_evict_sessions", count => Sessions, purge_rate => PurgeRate } ), NewData = Data#{current_sessions => Sessions}, {keep_state, NewData, [{state_timeout, ?EVICT_INTERVAL, purge}]}; - {enabled, #{sessions := 0}} -> + _Sessions = 0 -> NewData = Data#{current_conns => 0}, ?SLOG(warning, #{msg => "cluster_purge_evict_sessions_done"}), - {next_state, ?cleaning_retained_messages, NewData, [ + {next_state, ?cleaning_data, NewData, [ {state_timeout, 0, clean_retained_messages} ]} end; handle_event( state_timeout, clean_retained_messages, - ?cleaning_retained_messages, + ?cleaning_data, Data ) -> - ?SLOG(warning, #{msg => "cluster_purge_cleaning_retained_messages"}), + ?SLOG(warning, #{msg => "cluster_purge_cleaning_data"}), ok = emqx_retainer:clean(), + ok = emqx_delayed:clear_all(), ?tp(warning, "cluster_purge_done", #{}), ok = disable_purge(), ?tp(warning, "cluster_purge_finished_successfully", #{}), diff --git a/apps/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl index 363254298..7d0cab0ce 100644 --- a/apps/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl +++ b/apps/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl @@ -368,14 +368,7 @@ emqx_node_rebalance_cli(Node, Args) -> with_some_sessions(Fn) -> emqx_common_test_helpers:with_mock( emqx_eviction_agent, - status, - fun() -> - case meck:passthrough([]) of - {enabled, Status = #{sessions := _}} -> - {enabled, Status#{sessions := 100}}; - Res -> - Res - end - end, + all_channels_count, + fun() 
-> 100 end, Fn ). diff --git a/apps/emqx_node_rebalance/test/emqx_node_rebalance_purge_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_purge_SUITE.erl index 2dd3ee93e..7cdcc4d71 100644 --- a/apps/emqx_node_rebalance/test/emqx_node_rebalance_purge_SUITE.erl +++ b/apps/emqx_node_rebalance/test/emqx_node_rebalance_purge_SUITE.erl @@ -109,6 +109,10 @@ app_specs() -> #{enable => true} } }}, + {emqx_modules, #{ + config => + #{delayed => #{enable => true}} + }}, emqx_eviction_agent, emqx_node_rebalance ]. @@ -133,15 +137,8 @@ with_some_sessions(Node, Fn) -> erpc:call(Node, fun() -> emqx_common_test_helpers:with_mock( emqx_eviction_agent, - status, - fun() -> - case meck:passthrough([]) of - {enabled, Status = #{sessions := _}} -> - {enabled, Status#{sessions := 100}}; - Res -> - Res - end - end, + all_channels_count, + fun() -> 100 end, Fn ) end). @@ -317,8 +314,13 @@ t_session_purged(Config) -> Port1 = get_mqtt_port(Node1, tcp), Port2 = get_mqtt_port(Node2, tcp), - Node1Clients = emqtt_connect_many(Port1, 20, _StartN1 = 1), - Node2Clients = emqtt_connect_many(Port2, 20, _StartN2 = 21), + %% N.B.: it's important to have an asymmetric number of clients for this test, as + %% otherwise the scenario might happen to finish successfully due to the wrong + %% reasons! 
+ NumClientsNode1 = 5, + NumClientsNode2 = 35, + Node1Clients = emqtt_connect_many(Port1, NumClientsNode1, _StartN1 = 1), + Node2Clients = emqtt_connect_many(Port2, NumClientsNode2, _StartN2 = 21), lists:foreach( fun(C) -> ClientId = proplists:get_value(clientid, emqtt:info(C)), @@ -327,6 +329,8 @@ t_session_purged(Config) -> Payload = ClientId, Opts = [{retain, true}], ok = emqtt:publish(C, Topic, Props, Payload, Opts), + DelayedTopic = emqx_topic:join([<<"$delayed/120">>, Topic]), + ok = emqtt:publish(C, DelayedTopic, Payload), {ok, _, [?RC_GRANTED_QOS_0]} = emqtt:subscribe(C, Topic), ok end, @@ -334,12 +338,13 @@ t_session_purged(Config) -> ), ?assertEqual(40, erpc:call(Node2, emqx_retainer, retained_count, [])), + ?assertEqual(NumClientsNode1, erpc:call(Node1, emqx_delayed, delayed_count, [])), + ?assertEqual(NumClientsNode2, erpc:call(Node2, emqx_delayed, delayed_count, [])), {ok, SRef0} = snabbkaffe:subscribe( ?match_event(#{?snk_kind := "cluster_purge_done"}), 15_000 ), - %% ok = rpc:call(Node1, emqx_node_rebalance_purge, start_global, [Nodes, opts(Config)]), ok = rpc:call(Node1, emqx_node_rebalance_purge, start, [opts(Config)]), {ok, _} = snabbkaffe:receive_events(SRef0), @@ -347,6 +352,8 @@ t_session_purged(Config) -> ?assertEqual([], erpc:call(Node2, emqx_cm, all_channels, [])), ?assertEqual(0, erpc:call(Node1, emqx_retainer, retained_count, [])), ?assertEqual(0, erpc:call(Node2, emqx_retainer, retained_count, [])), + ?assertEqual(0, erpc:call(Node1, emqx_delayed, delayed_count, [])), + ?assertEqual(0, erpc:call(Node2, emqx_delayed, delayed_count, [])), ok = drain_exits(Node1Clients ++ Node2Clients), From 5908b69353df8041b7c2c5789316bbbdd05b5500 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Tue, 22 Aug 2023 16:30:15 -0300 Subject: [PATCH 48/85] chore: hide cluster purge from cli and api for now --- .../src/emqx_node_rebalance_api.erl | 83 ++++++++++--------- .../src/emqx_node_rebalance_cli.erl | 11 +-- 
.../test/emqx_node_rebalance_api_SUITE.erl | 3 +- 3 files changed, 51 insertions(+), 46 deletions(-) diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance_api.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_api.erl index ddeebf188..a9dc535ad 100644 --- a/apps/emqx_node_rebalance/src/emqx_node_rebalance_api.erl +++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance_api.erl @@ -68,9 +68,10 @@ paths() -> "/load_rebalance/:node/start", "/load_rebalance/:node/stop", "/load_rebalance/:node/evacuation/start", - "/load_rebalance/:node/evacuation/stop", - "/load_rebalance/:node/purge/start", - "/load_rebalance/:node/purge/stop" + "/load_rebalance/:node/evacuation/stop" + %% TODO: uncomment after we officially release the feature. + %% "/load_rebalance/:node/purge/start", + %% "/load_rebalance/:node/purge/stop" ]. schema("/load_rebalance/status") -> @@ -179,42 +180,43 @@ schema("/load_rebalance/:node/evacuation/stop") -> 404 => error_codes([?NOT_FOUND], <<"Not Found">>) } } - }; -schema("/load_rebalance/:node/purge/start") -> - #{ - 'operationId' => '/load_rebalance/:node/purge/start', - post => #{ - tags => [<<"load_rebalance">>], - summary => <<"Start purge on the whole cluster">>, - description => ?DESC("cluster_purge_start"), - parameters => [param_node()], - 'requestBody' => - emqx_dashboard_swagger:schema_with_examples( - ref(purge_start), - purge_example() - ), - responses => #{ - 200 => response_schema(), - 400 => error_codes([?BAD_REQUEST], <<"Bad Request">>), - 404 => error_codes([?NOT_FOUND], <<"Not Found">>) - } - } - }; -schema("/load_rebalance/:node/purge/stop") -> - #{ - 'operationId' => '/load_rebalance/:node/purge/stop', - post => #{ - tags => [<<"load_rebalance">>], - summary => <<"Stop purge on the whole cluster">>, - description => ?DESC("cluster_purge_stop"), - parameters => [param_node()], - responses => #{ - 200 => response_schema(), - 400 => error_codes([?BAD_REQUEST], <<"Bad Request">>), - 404 => error_codes([?NOT_FOUND], <<"Not 
Found">>) - } - } }. +%% TODO: uncomment after we officially release the feature. +%% schema("/load_rebalance/:node/purge/start") -> +%% #{ +%% 'operationId' => '/load_rebalance/:node/purge/start', +%% post => #{ +%% tags => [<<"load_rebalance">>], +%% summary => <<"Start purge on the whole cluster">>, +%% description => ?DESC("cluster_purge_start"), +%% parameters => [param_node()], +%% 'requestBody' => +%% emqx_dashboard_swagger:schema_with_examples( +%% ref(purge_start), +%% purge_example() +%% ), +%% responses => #{ +%% 200 => response_schema(), +%% 400 => error_codes([?BAD_REQUEST], <<"Bad Request">>), +%% 404 => error_codes([?NOT_FOUND], <<"Not Found">>) +%% } +%% } +%% }; +%% schema("/load_rebalance/:node/purge/stop") -> +%% #{ +%% 'operationId' => '/load_rebalance/:node/purge/stop', +%% post => #{ +%% tags => [<<"load_rebalance">>], +%% summary => <<"Stop purge on the whole cluster">>, +%% description => ?DESC("cluster_purge_stop"), +%% parameters => [param_node()], +%% responses => #{ +%% 200 => response_schema(), +%% 400 => error_codes([?BAD_REQUEST], <<"Bad Request">>), +%% 404 => error_codes([?NOT_FOUND], <<"Not Found">>) +%% } +%% } +%% }. %%-------------------------------------------------------------------- %% Handlers @@ -849,8 +851,9 @@ rebalance_evacuation_example() -> } }. -purge_example() -> - #{purge => #{purge_rate => 100}}. +%% TODO: uncomment after we officially release the feature. +%% purge_example() -> +%% #{purge => #{purge_rate => 100}}. local_status_response_schema() -> hoconsc:union([ref(local_status_disabled), ref(local_status_enabled)]). 
diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl index 9e591eb5c..9e0a173ea 100644 --- a/apps/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl +++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl @@ -138,11 +138,12 @@ cli(_) -> "Start current node evacuation with optional server redirect to the specified servers" }, - { - "rebalance start --purge \\\n" - " [--purge-rate CountPerSec]", - "Start purge on all running nodes in the cluster" - }, + %% TODO: uncomment after we officially release the feature. + %% { + %% "rebalance start --purge \\\n" + %% " [--purge-rate CountPerSec]", + %% "Start purge on all running nodes in the cluster" + %% }, { "rebalance start \\\n" diff --git a/apps/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl index 37227c15e..017e85971 100644 --- a/apps/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl +++ b/apps/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl @@ -120,7 +120,8 @@ t_start_evacuation_validation(Config) -> api_get(["load_rebalance", "global_status"]) ). -t_start_purge_validation(Config) -> +%% TODO: uncomment after we officially release the feature. 
+skipped_t_start_purge_validation(Config) -> [Node1 | _] = ?config(cluster_nodes, Config), Port1 = get_mqtt_port(Node1, tcp), BadOpts = [ From 007274914313f30f31d2d192aefc832a7dd2bd7b Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Wed, 23 Aug 2023 19:32:16 +0300 Subject: [PATCH 49/85] fix(emqx_trace): don't download empty trace log file Closes: EMQX-10274 --- apps/emqx/src/emqx_trace/emqx_trace.erl | 25 +++- apps/emqx/test/emqx_trace_SUITE.erl | 32 ++++ .../src/emqx_mgmt_api_trace.erl | 141 ++++++++++-------- .../test/emqx_mgmt_api_trace_SUITE.erl | 29 +++- changes/ce/fix-11506.en.md | 4 + 5 files changed, 166 insertions(+), 65 deletions(-) create mode 100644 changes/ce/fix-11506.en.md diff --git a/apps/emqx/src/emqx_trace/emqx_trace.erl b/apps/emqx/src/emqx_trace/emqx_trace.erl index 9ce6f9d38..4fff45229 100644 --- a/apps/emqx/src/emqx_trace/emqx_trace.erl +++ b/apps/emqx/src/emqx_trace/emqx_trace.erl @@ -160,7 +160,7 @@ create(Trace) -> end; false -> {error, - "The number of traces created has reache the maximum" + "The number of traces created has reached the maximum" " please delete the useless ones first"} end. @@ -371,10 +371,16 @@ start_trace(Trace) -> stop_trace(Finished, Started) -> lists:foreach( - fun(#{name := Name, type := Type, filter := Filter}) -> + fun(#{name := Name, id := HandlerID, dst := FilePath, type := Type, filter := Filter}) -> case lists:member(Name, Finished) of true -> - ?TRACE("API", "trace_stopping", #{Type => Filter}), + _ = maybe_sync_logfile(HandlerID), + case file:read_file_info(FilePath) of + {ok, #file_info{size = Size}} when Size > 0 -> + ?TRACE("API", "trace_stopping", #{Type => Filter}); + _ -> + ok + end, emqx_trace_handler:uninstall(Type, Name); false -> ok @@ -383,6 +389,19 @@ stop_trace(Finished, Started) -> Started ). 
+maybe_sync_logfile(HandlerID) -> + case logger:get_handler_config(HandlerID) of + {ok, #{module := Mod}} -> + case erlang:function_exported(Mod, filesync, 1) of + true -> + Mod:filesync(HandlerID); + false -> + ok + end; + _ -> + ok + end. + clean_stale_trace_files() -> TraceDir = trace_dir(), case file:list_dir(TraceDir) of diff --git a/apps/emqx/test/emqx_trace_SUITE.erl b/apps/emqx/test/emqx_trace_SUITE.erl index ce7d7e887..7e932a1d0 100644 --- a/apps/emqx/test/emqx_trace_SUITE.erl +++ b/apps/emqx/test/emqx_trace_SUITE.erl @@ -24,6 +24,7 @@ -include_lib("emqx/include/emqx.hrl"). -include_lib("emqx/include/emqx_trace.hrl"). -include_lib("snabbkaffe/include/snabbkaffe.hrl"). +-include_lib("kernel/include/file.hrl"). %%-------------------------------------------------------------------- %% Setups @@ -52,6 +53,7 @@ init_per_testcase(_, Config) -> Config. end_per_testcase(_) -> + snabbkaffe:stop(), ok. t_base_create_delete(_Config) -> @@ -454,6 +456,36 @@ t_migrate_trace(_Config) -> ), ok. +%% If no relevant event occurred, the log file size must be exactly 0 after stopping the trace. +t_empty_trace_log_file(_Config) -> + ?check_trace( + begin + Now = erlang:system_time(second), + Name = <<"empty_trace_log">>, + Trace = [ + {<<"name">>, Name}, + {<<"type">>, clientid}, + {<<"clientid">>, <<"test_trace_no_clientid_1">>}, + {<<"start_at">>, Now}, + {<<"end_at">>, Now + 100} + ], + ?wait_async_action( + ?assertMatch({ok, _}, emqx_trace:create(Trace)), + #{?snk_kind := update_trace_done} + ), + ok = emqx_trace_handler_SUITE:filesync(Name, clientid), + {ok, Filename} = emqx_trace:get_trace_filename(Name), + ?assertMatch({ok, #{size := 0}}, emqx_trace:trace_file_detail(Filename)), + ?wait_async_action( + ?assertEqual(ok, emqx_trace:update(Name, false)), + #{?snk_kind := update_trace_done} + ), + ?assertMatch({ok, #{size := 0}}, emqx_trace:trace_file_detail(Filename)), + ?assertEqual(ok, emqx_trace:delete(Name)) + end, + [] + ). 
+ build_new_trace_data() -> Now = erlang:system_time(second), {ok, _} = emqx_trace:create([ diff --git a/apps/emqx_management/src/emqx_mgmt_api_trace.erl b/apps/emqx_management/src/emqx_mgmt_api_trace.erl index 17adf7460..bcc21a97b 100644 --- a/apps/emqx_management/src/emqx_mgmt_api_trace.erl +++ b/apps/emqx_management/src/emqx_mgmt_api_trace.erl @@ -22,6 +22,7 @@ -include_lib("emqx/include/logger.hrl"). -include_lib("snabbkaffe/include/snabbkaffe.hrl"). -include_lib("hocon/include/hoconsc.hrl"). +-include_lib("emqx_utils/include/emqx_utils_api.hrl"). -export([ api_spec/0, @@ -51,8 +52,7 @@ -define(MAX_SINT32, 2147483647). -define(TO_BIN(_B_), iolist_to_binary(_B_)). --define(NOT_FOUND(N), {404, #{code => 'NOT_FOUND', message => ?TO_BIN([N, " NOT FOUND"])}}). --define(SERVICE_UNAVAILABLE(C, M), {503, #{code => C, message => ?TO_BIN(M)}}). +-define(NOT_FOUND_WITH_MSG(N), ?NOT_FOUND(?TO_BIN([N, " NOT FOUND"]))). -define(TAGS, [<<"Trace">>]). namespace() -> "trace". @@ -476,13 +476,13 @@ format_trace(Trace0) -> delete_trace(delete, #{bindings := #{name := Name}}) -> case emqx_trace:delete(Name) of ok -> {204}; - {error, not_found} -> ?NOT_FOUND(Name) + {error, not_found} -> ?NOT_FOUND_WITH_MSG(Name) end. update_trace(put, #{bindings := #{name := Name}}) -> case emqx_trace:update(Name, false) of ok -> {200, #{enable => false, name => Name}}; - {error, not_found} -> ?NOT_FOUND(Name) + {error, not_found} -> ?NOT_FOUND_WITH_MSG(Name) end. %% if HTTP request headers include accept-encoding: gzip and file size > 300 bytes. @@ -493,64 +493,85 @@ download_trace_log(get, #{bindings := #{name := Name}, query_string := Query}) - case parse_node(Query, undefined) of {ok, Node} -> TraceFiles = collect_trace_file(Node, TraceLog), - %% We generate a session ID so that we name files - %% with unique names. Then we won't cause - %% overwrites for concurrent requests. 
- SessionId = emqx_utils:gen_id(), - ZipDir = filename:join([emqx_trace:zip_dir(), SessionId]), - ok = file:make_dir(ZipDir), - %% Write files to ZipDir and create an in-memory zip file - Zips = group_trace_file(ZipDir, TraceLog, TraceFiles), - ZipName = binary_to_list(Name) ++ ".zip", - Binary = - try - {ok, {ZipName, Bin}} = zip:zip(ZipName, Zips, [memory, {cwd, ZipDir}]), - Bin - after - %% emqx_trace:delete_files_after_send(ZipFileName, Zips), - %% TODO use file replace file_binary.(delete file after send is not ready now). - ok = file:del_dir_r(ZipDir) - end, - ?tp(trace_api_download_trace_log, #{ - files => Zips, - name => Name, - session_id => SessionId, - zip_dir => ZipDir, - zip_name => ZipName - }), - Headers = #{ - <<"content-type">> => <<"application/x-zip">>, - <<"content-disposition">> => iolist_to_binary( - "attachment; filename=" ++ ZipName - ) - }, - {200, Headers, {file_binary, ZipName, Binary}}; + maybe_download_trace_log(Name, TraceLog, TraceFiles); {error, not_found} -> - ?NOT_FOUND(<<"Node">>) + ?NOT_FOUND_WITH_MSG(<<"Node">>) end; {error, not_found} -> - ?NOT_FOUND(Name) + ?NOT_FOUND_WITH_MSG(Name) end. +maybe_download_trace_log(Name, TraceLog, TraceFiles) -> + case group_trace_files(TraceLog, TraceFiles) of + #{nonempty := Files} -> + do_download_trace_log(Name, TraceLog, Files); + #{error := Reasons} -> + ?INTERNAL_ERROR(Reasons); + #{empty := _} -> + ?NOT_FOUND(<<"Trace is empty">>) + end. + +do_download_trace_log(Name, TraceLog, TraceFiles) -> + %% We generate a session ID so that we name files + %% with unique names. Then we won't cause + %% overwrites for concurrent requests. 
+ SessionId = emqx_utils:gen_id(), + ZipDir = filename:join([emqx_trace:zip_dir(), SessionId]), + ok = file:make_dir(ZipDir), + %% Write files to ZipDir and create an in-memory zip file + Zips = group_trace_file(ZipDir, TraceLog, TraceFiles), + ZipName = binary_to_list(Name) ++ ".zip", + Binary = + try + {ok, {ZipName, Bin}} = zip:zip(ZipName, Zips, [memory, {cwd, ZipDir}]), + Bin + after + %% emqx_trace:delete_files_after_send(ZipFileName, Zips), + %% TODO use file replace file_binary.(delete file after send is not ready now). + ok = file:del_dir_r(ZipDir) + end, + ?tp(trace_api_download_trace_log, #{ + files => Zips, + name => Name, + session_id => SessionId, + zip_dir => ZipDir, + zip_name => ZipName + }), + Headers = #{ + <<"content-type">> => <<"application/x-zip">>, + <<"content-disposition">> => iolist_to_binary( + "attachment; filename=" ++ ZipName + ) + }, + {200, Headers, {file_binary, ZipName, Binary}}. + +group_trace_files(TraceLog, TraceFiles) -> + maps:groups_from_list( + fun + ({ok, _Node, <<>>}) -> + empty; + ({ok, _Node, _Bin}) -> + nonempty; + ({error, Node, Reason}) -> + ?SLOG(error, #{ + msg => "download_trace_log_error", + node => Node, + log => TraceLog, + reason => Reason + }), + error + end, + TraceFiles + ). 
+ group_trace_file(ZipDir, TraceLog, TraceFiles) -> lists:foldl( - fun(Res, Acc) -> - case Res of - {ok, Node, Bin} -> - FileName = Node ++ "-" ++ TraceLog, - ZipName = filename:join([ZipDir, FileName]), - case file:write_file(ZipName, Bin) of - ok -> [FileName | Acc]; - _ -> Acc - end; - {error, Node, Reason} -> - ?SLOG(error, #{ - msg => "download_trace_log_error", - node => Node, - log => TraceLog, - reason => Reason - }), - Acc + fun({ok, Node, Bin}, Acc) -> + FileName = Node ++ "-" ++ TraceLog, + ZipName = filename:join([ZipDir, FileName]), + case file:write_file(ZipName, Bin) of + ok -> [FileName | Acc]; + _ -> Acc end end, [], @@ -578,7 +599,7 @@ log_file_detail(get, #{bindings := #{name := Name}}) -> TraceFiles = collect_trace_file_detail(TraceLog), {200, group_trace_file_detail(TraceFiles)}; {error, not_found} -> - ?NOT_FOUND(Name) + ?NOT_FOUND_WITH_MSG(Name) end. group_trace_file_detail(TraceLogDetail) -> @@ -609,7 +630,7 @@ stream_log_file(get, #{bindings := #{name := Name}, query_string := Query}) -> Meta = #{<<"position">> => Position, <<"bytes">> => Bytes}, {200, #{meta => Meta, items => <<"">>}}; {error, not_found} -> - ?NOT_FOUND(Name); + ?NOT_FOUND_WITH_MSG(Name); {error, enomem} -> ?SLOG(warning, #{ code => not_enough_mem, @@ -617,12 +638,12 @@ stream_log_file(get, #{bindings := #{name := Name}, query_string := Query}) -> bytes => Bytes, name => Name }), - ?SERVICE_UNAVAILABLE('SERVICE_UNAVAILABLE', <<"Requested chunk size too big">>); + ?SERVICE_UNAVAILABLE(<<"Requested chunk size too big">>); {badrpc, nodedown} -> - ?NOT_FOUND(<<"Node">>) + ?NOT_FOUND_WITH_MSG(<<"Node">>) end; {error, not_found} -> - ?NOT_FOUND(<<"Node">>) + ?NOT_FOUND_WITH_MSG(<<"Node">>) end. -spec get_trace_size() -> #{{node(), file:name_all()} => non_neg_integer()}. 
diff --git a/apps/emqx_management/test/emqx_mgmt_api_trace_SUITE.erl b/apps/emqx_management/test/emqx_mgmt_api_trace_SUITE.erl index 8f9a4a5ca..f4725b453 100644 --- a/apps/emqx_management/test/emqx_mgmt_api_trace_SUITE.erl +++ b/apps/emqx_management/test/emqx_mgmt_api_trace_SUITE.erl @@ -369,6 +369,28 @@ t_trace_files_are_deleted_after_download(_Config) -> ), ok. +t_download_empty_trace(_Config) -> + ClientId = <<"client-test-empty-trace-download">>, + Now = erlang:system_time(second), + Name = <<"test_client_id_empty_trace">>, + load(), + create_trace(Name, ClientId, Now), + ok = emqx_trace_handler_SUITE:filesync(Name, clientid), + ?check_trace( + begin + ?wait_async_action( + ?assertMatch( + {ok, _}, request_api(put, api_path(<<"trace/", Name/binary, "/stop">>), #{}) + ), + #{?snk_kind := update_trace_done} + ) + end, + [] + ), + {error, {{_, 404, _}, _Headers, Body}} = + request_api(get, api_path(<<"trace/", Name/binary, "/download">>), [], #{return_all => true}), + ?assertMatch(#{<<"message">> := <<"Trace is empty">>}, emqx_utils_json:decode(Body)). + to_rfc3339(Second) -> list_to_binary(calendar:system_time_to_rfc3339(Second)). @@ -376,8 +398,11 @@ request_api(Method, Url) -> request_api(Method, Url, []). request_api(Method, Url, Body) -> - Opts = #{httpc_req_opts => [{body_format, binary}]}, - emqx_mgmt_api_test_util:request_api(Method, Url, [], [], Body, Opts). + request_api(Method, Url, Body, #{}). + +request_api(Method, Url, Body, Opts) -> + Opts1 = Opts#{httpc_req_opts => [{body_format, binary}]}, + emqx_mgmt_api_test_util:request_api(Method, Url, [], [], Body, Opts1). api_path(Path) -> emqx_mgmt_api_test_util:api_path([Path]). diff --git a/changes/ce/fix-11506.en.md b/changes/ce/fix-11506.en.md new file mode 100644 index 000000000..7341134ac --- /dev/null +++ b/changes/ce/fix-11506.en.md @@ -0,0 +1,4 @@ +Don't download a trace log file if it is empty. 
+ +After this fix, GET `/api/v5/trace/clientempty/download` returns 404 `{"code":"NOT_FOUND","message":"Trace is empty"}` +if no events matching the trace condition occurred. From 279895b8fd9482da249db6fa95a40a3a329982f6 Mon Sep 17 00:00:00 2001 From: Ilya Averyanov Date: Mon, 28 Aug 2023 22:45:05 +0300 Subject: [PATCH 50/85] chore(ft): add read/write concurrency to emqx_ft_async_reply tabs --- apps/emqx_ft/src/emqx_ft_async_reply.erl | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/apps/emqx_ft/src/emqx_ft_async_reply.erl b/apps/emqx_ft/src/emqx_ft_async_reply.erl index f33558434..7ac4c527f 100644 --- a/apps/emqx_ft/src/emqx_ft_async_reply.erl +++ b/apps/emqx_ft/src/emqx_ft_async_reply.erl @@ -55,8 +55,15 @@ -spec create_tables() -> ok. create_tables() -> - _ = ets:new(?MON_TAB, [named_table, public, ordered_set]), - _ = ets:new(?PACKET_TAB, [named_table, public, ordered_set]), + EtsOptions = [ + named_table, + public, + ordered_set, + {read_concurrency, true}, + {write_concurrency, true} + ], + _ = ets:new(?MON_TAB, EtsOptions), + _ = ets:new(?PACKET_TAB, EtsOptions), ok. -spec register(packet_id(), mon_ref(), timer_ref()) -> ok. 
From 4245a4d8c6421f1079904fbb2980eff1ff3649d6 Mon Sep 17 00:00:00 2001 From: zhongwencool Date: Mon, 28 Aug 2023 18:21:56 +0800 Subject: [PATCH 51/85] fix: authz clean-cache clientid always return not_found --- apps/emqx_ctl/src/emqx_ctl.app.src | 2 +- apps/emqx_ctl/src/emqx_ctl.erl | 3 +-- apps/emqx_management/src/emqx_mgmt_cli.erl | 8 ++++--- .../test/emqx_mgmt_cli_SUITE.erl | 21 +++++++++++++++++-- 4 files changed, 26 insertions(+), 8 deletions(-) diff --git a/apps/emqx_ctl/src/emqx_ctl.app.src b/apps/emqx_ctl/src/emqx_ctl.app.src index 1196f17a5..c3a55ef7b 100644 --- a/apps/emqx_ctl/src/emqx_ctl.app.src +++ b/apps/emqx_ctl/src/emqx_ctl.app.src @@ -1,6 +1,6 @@ {application, emqx_ctl, [ {description, "Backend for emqx_ctl script"}, - {vsn, "0.1.2"}, + {vsn, "0.1.3"}, {registered, []}, {mod, {emqx_ctl_app, []}}, {applications, [ diff --git a/apps/emqx_ctl/src/emqx_ctl.erl b/apps/emqx_ctl/src/emqx_ctl.erl index d1a7ed1d7..a51f4919c 100644 --- a/apps/emqx_ctl/src/emqx_ctl.erl +++ b/apps/emqx_ctl/src/emqx_ctl.erl @@ -119,8 +119,7 @@ run_command(Cmd, Args) when is_atom(Cmd) -> case lookup_command(Cmd) of [{Mod, Fun}] -> try - _ = apply(Mod, Fun, [Args]), - ok + apply(Mod, Fun, [Args]) catch _:Reason:Stacktrace -> ?LOG_ERROR(#{ diff --git a/apps/emqx_management/src/emqx_mgmt_cli.erl b/apps/emqx_management/src/emqx_mgmt_cli.erl index aeed5b922..1b6c614ce 100644 --- a/apps/emqx_management/src/emqx_mgmt_cli.erl +++ b/apps/emqx_management/src/emqx_mgmt_cli.erl @@ -707,7 +707,7 @@ authz(["cache-clean", "all"]) -> with_log(fun emqx_mgmt:clean_authz_cache_all/0, Msg); authz(["cache-clean", ClientId]) -> Msg = io_lib:format("Drain ~ts authz cache", [ClientId]), - with_log(fun() -> emqx_mgmt:clean_authz_cache(ClientId) end, Msg); + with_log(fun() -> emqx_mgmt:clean_authz_cache(list_to_binary(ClientId)) end, Msg); authz(_) -> emqx_ctl:usage( [ @@ -921,12 +921,14 @@ for_node(Fun, Node) -> end. 
with_log(Fun, Msg) -> - case Fun() of + Res = Fun(), + case Res of ok -> emqx_ctl:print("~s OK~n", [Msg]); {error, Reason} -> emqx_ctl:print("~s FAILED~n~p~n", [Msg, Reason]) - end. + end, + Res. cluster_info() -> RunningNodes = safe_call_mria(running_nodes, [], []), diff --git a/apps/emqx_management/test/emqx_mgmt_cli_SUITE.erl b/apps/emqx_management/test/emqx_mgmt_cli_SUITE.erl index 33292e54e..405890729 100644 --- a/apps/emqx_management/test/emqx_mgmt_cli_SUITE.erl +++ b/apps/emqx_management/test/emqx_mgmt_cli_SUITE.erl @@ -25,6 +25,7 @@ all() -> init_per_suite(Config) -> emqx_mgmt_api_test_util:init_suite([emqx_conf, emqx_management]), + ok = emqx_mgmt_cli:load(), Config. end_per_suite(_) -> @@ -183,9 +184,25 @@ t_listeners(_Config) -> t_authz(_Config) -> %% authz cache-clean all # Clears authorization cache on all nodes - emqx_ctl:run_command(["authz", "cache-clean", "all"]), - %% authz cache-clean node <Node> # Clears authorization cache on given node + ?assertMatch(ok, emqx_ctl:run_command(["authz", "cache-clean", "all"])), + ClientId = "authz_clean_test", + ClientIdBin = list_to_binary(ClientId), %% authz cache-clean <ClientId> # Clears authorization cache for given client + ?assertMatch({error, not_found}, emqx_ctl:run_command(["authz", "cache-clean", ClientId])), + {ok, C} = emqtt:start_link([{clean_start, true}, {clientid, ClientId}]), + {ok, _} = emqtt:connect(C), + {ok, _, _} = emqtt:subscribe(C, <<"topic/1">>, 1), + [Pid] = emqx_cm:lookup_channels(ClientIdBin), + ?assertMatch([_], gen_server:call(Pid, list_authz_cache)), + + ?assertMatch(ok, emqx_ctl:run_command(["authz", "cache-clean", ClientId])), + ?assertMatch([], gen_server:call(Pid, list_authz_cache)), + %% authz cache-clean node <Node> # Clears authorization cache on given node + {ok, _, _} = emqtt:subscribe(C, <<"topic/2">>, 1), + ?assertMatch([_], gen_server:call(Pid, list_authz_cache)), + ?assertMatch(ok, emqx_ctl:run_command(["authz", "cache-clean", "node", atom_to_list(node())])), + ?assertMatch([],
gen_server:call(Pid, list_authz_cache)), + ok = emqtt:disconnect(C), ok. t_olp(_Config) -> From 2967b709ae449749043b3f7fcccdd674c9fb2085 Mon Sep 17 00:00:00 2001 From: zhongwencool Date: Tue, 29 Aug 2023 08:20:48 +0800 Subject: [PATCH 52/85] chore: add changelog for PR(11531) --- changes/ce/fix-11531.en.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/ce/fix-11531.en.md diff --git a/changes/ce/fix-11531.en.md b/changes/ce/fix-11531.en.md new file mode 100644 index 000000000..00296433a --- /dev/null +++ b/changes/ce/fix-11531.en.md @@ -0,0 +1 @@ +Fixed issue where authorization cache cleaning cli was not working properly for specific client ID. From 6722722522713cd7c8f6f87d474d646aaf6e39fa Mon Sep 17 00:00:00 2001 From: firest Date: Mon, 28 Aug 2023 18:43:17 +0800 Subject: [PATCH 53/85] fix(frame): improve some error reasons when parsing invalid packet --- apps/emqx/src/emqx_frame.erl | 16 ++++++++++++---- apps/emqx/test/emqx_ws_connection_SUITE.erl | 13 +++++++++++-- changes/ce/perf-11532.en.md | 1 + 3 files changed, 24 insertions(+), 6 deletions(-) create mode 100644 changes/ce/perf-11532.en.md diff --git a/apps/emqx/src/emqx_frame.erl b/apps/emqx/src/emqx_frame.erl index 8620f834f..33b0ccbfa 100644 --- a/apps/emqx/src/emqx_frame.erl +++ b/apps/emqx/src/emqx_frame.erl @@ -472,8 +472,8 @@ parse_packet( ) -> {Properties, <<>>} = parse_properties(Rest, ?MQTT_PROTO_V5, StrictMode), #mqtt_packet_auth{reason_code = ReasonCode, properties = Properties}; -parse_packet(_Header, _FrameBin, _Options) -> - ?PARSE_ERR(malformed_packet). +parse_packet(Header, _FrameBin, _Options) -> + ?PARSE_ERR(#{hit => malformed_packet, header_type => Header#mqtt_packet_header.type}). 
parse_will_message( Packet = #mqtt_packet_connect{ @@ -512,8 +512,16 @@ parse_properties(<<0, Rest/binary>>, ?MQTT_PROTO_V5, _StrictMode) -> {#{}, Rest}; parse_properties(Bin, ?MQTT_PROTO_V5, StrictMode) -> {Len, Rest} = parse_variable_byte_integer(Bin), - <<PropsBin:Len/binary, Rest1/binary>> = Rest, - {parse_property(PropsBin, #{}, StrictMode), Rest1}. + case Rest of + <<PropsBin:Len/binary, Rest1/binary>> -> + {parse_property(PropsBin, #{}, StrictMode), Rest1}; + _ -> + ?PARSE_ERR(#{ + hint => user_property_not_enough_bytes, + parsed_key_length => Len, + remaining_bytes_length => byte_size(Rest) + }) + end. parse_property(<<>>, Props, _StrictMode) -> Props; diff --git a/apps/emqx/test/emqx_ws_connection_SUITE.erl b/apps/emqx/test/emqx_ws_connection_SUITE.erl index 9860248fe..a6dcdb48d 100644 --- a/apps/emqx/test/emqx_ws_connection_SUITE.erl +++ b/apps/emqx/test/emqx_ws_connection_SUITE.erl @@ -540,8 +540,17 @@ t_parse_incoming_order(_) -> t_parse_incoming_frame_error(_) -> {Packets, _St} = ?ws_conn:parse_incoming(<<3, 2, 1, 0>>, [], st()), - FrameError = {frame_error, malformed_packet}, - [{incoming, FrameError}] = Packets. + + ?assertMatch( + [ + {incoming, + {frame_error, #{ + header_type := _, + hit := malformed_packet + }}} + ], + Packets + ). t_handle_incomming_frame_error(_) -> FrameError = {frame_error, bad_qos}, diff --git a/changes/ce/perf-11532.en.md b/changes/ce/perf-11532.en.md new file mode 100644 index 000000000..d1c8b9266 --- /dev/null +++ b/changes/ce/perf-11532.en.md @@ -0,0 +1 @@ +Improve some error reasons for parsing with invalid packets.
From 501456efec6ee43a5499842e4e301f9c54336b32 Mon Sep 17 00:00:00 2001 From: zhongwencool Date: Tue, 29 Aug 2023 17:10:43 +0800 Subject: [PATCH 54/85] chore: more safe with iolist_to_binary Co-authored-by: Zaiming (Stone) Shi --- apps/emqx_management/src/emqx_mgmt_cli.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/emqx_management/src/emqx_mgmt_cli.erl b/apps/emqx_management/src/emqx_mgmt_cli.erl index 1b6c614ce..8564653a4 100644 --- a/apps/emqx_management/src/emqx_mgmt_cli.erl +++ b/apps/emqx_management/src/emqx_mgmt_cli.erl @@ -707,7 +707,7 @@ authz(["cache-clean", "all"]) -> with_log(fun emqx_mgmt:clean_authz_cache_all/0, Msg); authz(["cache-clean", ClientId]) -> Msg = io_lib:format("Drain ~ts authz cache", [ClientId]), - with_log(fun() -> emqx_mgmt:clean_authz_cache(list_to_binary(ClientId)) end, Msg); + with_log(fun() -> emqx_mgmt:clean_authz_cache(iolist_to_binary(ClientId)) end, Msg); authz(_) -> emqx_ctl:usage( [ From f57d16ba13a08ef19ef3e104e2770f032aa70ea3 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Tue, 29 Aug 2023 21:12:54 +0400 Subject: [PATCH 55/85] feat(cthsuite): add function to determine workdir of testrun In a deterministic fashion, to lift the burden of undestanding where the testrun's data should go from the test writer. 
--- apps/emqx/integration_test/emqx_ds_SUITE.erl | 6 +-- apps/emqx/test/emqx_cth_suite.erl | 53 ++++++++++++++++--- apps/emqx/test/emqx_flapping_SUITE.erl | 2 +- .../test/emqx_persistent_messages_SUITE.erl | 3 +- .../emqx_authz/test/emqx_authz_file_SUITE.erl | 2 +- .../test/emqx_authz_rich_actions_SUITE.erl | 2 +- .../test/emqx_bridge_api_SUITE.erl | 16 +++--- apps/emqx_ft/test/emqx_ft_SUITE.erl | 2 +- apps/emqx_ft/test/emqx_ft_conf_SUITE.erl | 3 +- .../emqx_ft/test/emqx_ft_storage_fs_SUITE.erl | 3 +- .../test/emqx_ft_storage_fs_gc_SUITE.erl | 2 +- 11 files changed, 64 insertions(+), 30 deletions(-) diff --git a/apps/emqx/integration_test/emqx_ds_SUITE.erl b/apps/emqx/integration_test/emqx_ds_SUITE.erl index 842782e35..3f0d3f3e4 100644 --- a/apps/emqx/integration_test/emqx_ds_SUITE.erl +++ b/apps/emqx/integration_test/emqx_ds_SUITE.erl @@ -22,7 +22,7 @@ all() -> init_per_suite(Config) -> TCApps = emqx_cth_suite:start( app_specs(), - #{work_dir => ?config(priv_dir, Config)} + #{work_dir => emqx_cth_suite:work_dir(Config)} ), [{tc_apps, TCApps} | Config]. @@ -31,9 +31,9 @@ end_per_suite(Config) -> emqx_cth_suite:stop(TCApps), ok. -init_per_testcase(t_session_subscription_idempotency, Config) -> +init_per_testcase(t_session_subscription_idempotency = TC, Config) -> Cluster = cluster(#{n => 1}), - ClusterOpts = #{work_dir => ?config(priv_dir, Config)}, + ClusterOpts = #{work_dir => emqx_cth_suite:work_dir(TC, Config)}, NodeSpecs = emqx_cth_cluster:mk_nodespecs(Cluster, ClusterOpts), Nodes = emqx_cth_cluster:start(Cluster, ClusterOpts), [ diff --git a/apps/emqx/test/emqx_cth_suite.erl b/apps/emqx/test/emqx_cth_suite.erl index 80b3a578c..b70245e9d 100644 --- a/apps/emqx/test/emqx_cth_suite.erl +++ b/apps/emqx/test/emqx_cth_suite.erl @@ -22,6 +22,9 @@ -export([start/2]). -export([stop/1]). +-export([work_dir/1]). +-export([work_dir/2]). + -export([load_apps/1]). -export([start_apps/2]). -export([start_app/2]). 
@@ -98,16 +101,11 @@ when SuiteOpts :: #{ %% Working directory %% Everything a test produces should go here. If this directory is not empty, - %% function will raise an error. + %% function will raise an error. Most of the time, the result of `work_dir/1` + %% or `work_dir/2` (if used in a testcase) should be fine here. work_dir := file:name() }. -start(Apps, SuiteOpts0 = #{work_dir := WorkDir0}) -> - %% when running CT on the whole app, it seems like `priv_dir` is the same on all - %% suites and leads to the "clean slate" verificatin to fail. - WorkDir = binary_to_list( - filename:join([WorkDir0, emqx_guid:to_hexstr(emqx_guid:gen())]) - ), - SuiteOpts = SuiteOpts0#{work_dir := WorkDir}, +start(Apps, SuiteOpts = #{work_dir := WorkDir}) -> % 1. Prepare appspec instructions AppSpecs = [mk_appspec(App, SuiteOpts) || App <- Apps], % 2. Load every app so that stuff scanning attributes of loaded modules works @@ -339,6 +337,45 @@ default_config(App, SuiteOpts) -> %% +%% @doc Determine the unique work directory for the current test run. +%% Takes into account name of the test suite, and all test groups the current run +%% is part of. +-spec work_dir(CTConfig :: proplists:proplist()) -> + file:filename_all(). +work_dir(CTConfig) -> + % Directory specific to the current test run. + [PrivDir] = proplists:get_all_values(priv_dir, CTConfig), + % Directory specific to the currently executing test suite. + [DataDir] = proplists:get_all_values(data_dir, CTConfig), + % NOTE: Contains the name of the current test group, if executed as part of a group. + GroupProps = proplists:get_value(tc_group_properties, CTConfig, []), + % NOTE: Contains names of outer test groups, if any. 
+ GroupPathOuter = proplists:get_value(tc_group_path, CTConfig, []), + SuiteDir = filename:basename(DataDir), + GroupPath = lists:append([GroupProps | GroupPathOuter]), + GroupLevels = [atom_to_list(Name) || {name, Name} <- GroupPath], + WorkDir1 = filename:join(PrivDir, SuiteDir), + WorkDir2 = + case GroupLevels of + [] -> + WorkDir1; + [_ | _] -> + GroupDir = string:join(lists:reverse(GroupLevels), "."), + filename:join(WorkDir1, GroupDir) + end, + WorkDir2. + +%% @doc Determine the unique work directory for the current testcase run. +%% Be careful when testcase runs under no groups, and its name matches the name of a +%% previously executed test group, it's best to avoid such naming. +-spec work_dir(TestCaseName :: atom(), CTConfig :: proplists:proplist()) -> + file:filename_all(). +work_dir(TCName, CTConfig) -> + WorkDir = work_dir(CTConfig), + filename:join(WorkDir, TCName). + +%% + start_ekka() -> ok = emqx_common_test_helpers:start_ekka(), {ok, [mnesia, ekka]}. diff --git a/apps/emqx/test/emqx_flapping_SUITE.erl b/apps/emqx/test/emqx_flapping_SUITE.erl index 6204d9b6d..021eaddbf 100644 --- a/apps/emqx/test/emqx_flapping_SUITE.erl +++ b/apps/emqx/test/emqx_flapping_SUITE.erl @@ -35,7 +35,7 @@ init_per_suite(Config) -> "\n ban_time = 2s" "\n }"} ], - #{work_dir => ?config(priv_dir, Config)} + #{work_dir => emqx_cth_suite:work_dir(Config)} ), [{suite_apps, Apps} | Config]. 
diff --git a/apps/emqx/test/emqx_persistent_messages_SUITE.erl b/apps/emqx/test/emqx_persistent_messages_SUITE.erl index db22b19e6..c4f7ef73b 100644 --- a/apps/emqx/test/emqx_persistent_messages_SUITE.erl +++ b/apps/emqx/test/emqx_persistent_messages_SUITE.erl @@ -33,10 +33,9 @@ init_per_suite(Config) -> %% TODO: remove after other suites start to use `emx_cth_suite' application:stop(emqx), application:stop(emqx_durable_storage), - WorkDir = ?config(priv_dir, Config), TCApps = emqx_cth_suite:start( app_specs(), - #{work_dir => WorkDir} + #{work_dir => emqx_cth_suite:work_dir(Config)} ), [{tc_apps, TCApps} | Config]. diff --git a/apps/emqx_authz/test/emqx_authz_file_SUITE.erl b/apps/emqx_authz/test/emqx_authz_file_SUITE.erl index 396679783..d31935363 100644 --- a/apps/emqx_authz/test/emqx_authz_file_SUITE.erl +++ b/apps/emqx_authz/test/emqx_authz_file_SUITE.erl @@ -44,7 +44,7 @@ init_per_testcase(TestCase, Config) -> {emqx_conf, "authorization.no_match = deny, authorization.cache.enable = false"}, emqx_authz ], - #{work_dir => filename:join(?config(priv_dir, Config), TestCase)} + #{work_dir => emqx_cth_suite:work_dir(TestCase, Config)} ), [{tc_apps, Apps} | Config]. diff --git a/apps/emqx_authz/test/emqx_authz_rich_actions_SUITE.erl b/apps/emqx_authz/test/emqx_authz_rich_actions_SUITE.erl index 8d24b5472..fc597f15b 100644 --- a/apps/emqx_authz/test/emqx_authz_rich_actions_SUITE.erl +++ b/apps/emqx_authz/test/emqx_authz_rich_actions_SUITE.erl @@ -37,7 +37,7 @@ init_per_testcase(TestCase, Config) -> {emqx_conf, "authorization.no_match = deny, authorization.cache.enable = false"}, emqx_authz ], - #{work_dir => filename:join(?config(priv_dir, Config), TestCase)} + #{work_dir => emqx_cth_suite:work_dir(TestCase, Config)} ), [{tc_apps, Apps} | Config]. 
diff --git a/apps/emqx_bridge/test/emqx_bridge_api_SUITE.erl b/apps/emqx_bridge/test/emqx_bridge_api_SUITE.erl index d8e697987..d08953682 100644 --- a/apps/emqx_bridge/test/emqx_bridge_api_SUITE.erl +++ b/apps/emqx_bridge/test/emqx_bridge_api_SUITE.erl @@ -116,13 +116,13 @@ end_per_suite(_Config) -> ok. init_per_group(cluster = Name, Config) -> - Nodes = [NodePrimary | _] = mk_cluster(Name, Config), + Nodes = [NodePrimary | _] = mk_cluster(Config), init_api([{group, Name}, {cluster_nodes, Nodes}, {node, NodePrimary} | Config]); init_per_group(cluster_later_join = Name, Config) -> - Nodes = [NodePrimary | _] = mk_cluster(Name, Config, #{join_to => undefined}), + Nodes = [NodePrimary | _] = mk_cluster(Config, #{join_to => undefined}), init_api([{group, Name}, {cluster_nodes, Nodes}, {node, NodePrimary} | Config]); -init_per_group(Name, Config) -> - WorkDir = filename:join(?config(priv_dir, Config), Name), +init_per_group(_Name, Config) -> + WorkDir = emqx_cth_suite:work_dir(Config), Apps = emqx_cth_suite:start(?APPSPECS ++ [?APPSPEC_DASHBOARD], #{work_dir => WorkDir}), init_api([{group, single}, {group_apps, Apps}, {node, node()} | Config]). @@ -131,10 +131,10 @@ init_api(Config) -> {ok, App} = erpc:call(APINode, emqx_common_test_http, create_default_app, []), [{api, App} | Config]. -mk_cluster(Name, Config) -> - mk_cluster(Name, Config, #{}). +mk_cluster(Config) -> + mk_cluster(Config, #{}). -mk_cluster(Name, Config, Opts) -> +mk_cluster(Config, Opts) -> Node1Apps = ?APPSPECS ++ [?APPSPEC_DASHBOARD], Node2Apps = ?APPSPECS, emqx_cth_cluster:start( @@ -142,7 +142,7 @@ mk_cluster(Name, Config, Opts) -> {emqx_bridge_api_SUITE1, Opts#{role => core, apps => Node1Apps}}, {emqx_bridge_api_SUITE2, Opts#{role => core, apps => Node2Apps}} ], - #{work_dir => filename:join(?config(priv_dir, Config), Name)} + #{work_dir => emqx_cth_suite:work_dir(Config)} ). 
end_per_group(Group, Config) when diff --git a/apps/emqx_ft/test/emqx_ft_SUITE.erl b/apps/emqx_ft/test/emqx_ft_SUITE.erl index 290cda333..dfee76f72 100644 --- a/apps/emqx_ft/test/emqx_ft_SUITE.erl +++ b/apps/emqx_ft/test/emqx_ft_SUITE.erl @@ -76,7 +76,7 @@ init_per_suite(Config) -> [ {emqx_ft, #{config => emqx_ft_test_helpers:config(Storage)}} ], - #{work_dir => ?config(priv_dir, Config)} + #{work_dir => emqx_cth_suite:work_dir(Config)} ), [{suite_apps, Apps} | Config]. diff --git a/apps/emqx_ft/test/emqx_ft_conf_SUITE.erl b/apps/emqx_ft/test/emqx_ft_conf_SUITE.erl index 3fdfdf65a..0acdea213 100644 --- a/apps/emqx_ft/test/emqx_ft_conf_SUITE.erl +++ b/apps/emqx_ft/test/emqx_ft_conf_SUITE.erl @@ -32,13 +32,12 @@ end_per_suite(_Config) -> ok. init_per_testcase(Case, Config) -> - WorkDir = filename:join(?config(priv_dir, Config), Case), Apps = emqx_cth_suite:start( [ {emqx_conf, #{}}, {emqx_ft, #{config => "file_transfer {}"}} ], - #{work_dir => WorkDir} + #{work_dir => emqx_cth_suite:work_dir(Case, Config)} ), [{suite_apps, Apps} | Config]. diff --git a/apps/emqx_ft/test/emqx_ft_storage_fs_SUITE.erl b/apps/emqx_ft/test/emqx_ft_storage_fs_SUITE.erl index a57cdf621..52d372e63 100644 --- a/apps/emqx_ft/test/emqx_ft_storage_fs_SUITE.erl +++ b/apps/emqx_ft/test/emqx_ft_storage_fs_SUITE.erl @@ -36,12 +36,11 @@ groups() -> init_per_suite(Config) -> Storage = emqx_ft_test_helpers:local_storage(Config), - WorkDir = ?config(priv_dir, Config), Apps = emqx_cth_suite:start( [ {emqx_ft, #{config => emqx_ft_test_helpers:config(Storage)}} ], - #{work_dir => WorkDir} + #{work_dir => emqx_cth_suite:work_dir(Config)} ), [{suite_apps, Apps} | Config]. diff --git a/apps/emqx_ft/test/emqx_ft_storage_fs_gc_SUITE.erl b/apps/emqx_ft/test/emqx_ft_storage_fs_gc_SUITE.erl index b14fc7edd..311ad7fbd 100644 --- a/apps/emqx_ft/test/emqx_ft_storage_fs_gc_SUITE.erl +++ b/apps/emqx_ft/test/emqx_ft_storage_fs_gc_SUITE.erl @@ -28,7 +28,7 @@ all() -> emqx_common_test_helpers:all(?MODULE). 
init_per_suite(Config) -> - Apps = emqx_cth_suite:start([emqx], #{work_dir => ?config(priv_dir, Config)}), + Apps = emqx_cth_suite:start([emqx], #{work_dir => emqx_cth_suite:work_dir(Config)}), [{suite_apps, Apps} | Config]. end_per_suite(Config) -> From 326809388182ad23d22fffb437436d033b4939b1 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Tue, 29 Aug 2023 21:16:26 +0400 Subject: [PATCH 56/85] feat(cth): add module-level documenation --- apps/emqx/test/emqx_cth_cluster.erl | 22 ++++++++++++++++ apps/emqx/test/emqx_cth_suite.erl | 41 +++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/apps/emqx/test/emqx_cth_cluster.erl b/apps/emqx/test/emqx_cth_cluster.erl index e24600181..3f8ea9a89 100644 --- a/apps/emqx/test/emqx_cth_cluster.erl +++ b/apps/emqx/test/emqx_cth_cluster.erl @@ -14,6 +14,28 @@ %% limitations under the License. %%-------------------------------------------------------------------- +%% @doc Common Test Helper / Running tests in a cluster +%% +%% This module allows setting up and tearing down clusters of EMQX nodes with +%% the purpose of running integration tests in a distributed environment, but +%% with the same isolation measures that `emqx_cth_suite` provides. +%% +%% Additionally to what `emqx_cth_suite` does with respect to isolation, each +%% node in the cluster is started with a separate, unique working directory. +%% +%% What should be started on each node is defined by the same appspecs that are +%% used by `emqx_cth_suite` to start applications on the CT node. However, there +%% are additional set of defaults applied to appspecs to make sure that the +%% cluster is started in a consistent, interconnected state, with no conflicts +%% between applications. +%% +%% Most of the time, you just need to: +%% 1. Describe the cluster with one or more _nodespecs_. +%% 2. Call `emqx_cth_cluster:start/2` before the testrun (e.g. in `init_per_suite/1` +%% or `init_per_group/2`), providing unique work dir (e.g. 
+%% `emqx_cth_suite:work_dir/1`). Save the result in a context. +%% 3. Call `emqx_cth_cluster:stop/1` after the testrun concludes (e.g. +%% in `end_per_suite/1` or `end_per_group/2`) with the result from step 2. -module(emqx_cth_cluster). -export([start/2]). diff --git a/apps/emqx/test/emqx_cth_suite.erl b/apps/emqx/test/emqx_cth_suite.erl index b70245e9d..090bca762 100644 --- a/apps/emqx/test/emqx_cth_suite.erl +++ b/apps/emqx/test/emqx_cth_suite.erl @@ -14,6 +14,47 @@ %% limitations under the License. %%-------------------------------------------------------------------- +%% @doc Common Test Helper / Running test suites +%% +%% The purpose of this module is to run application-level, integration +%% tests in an isolated fashion. +%% +%% Isolation is this context means that each testrun does not leave any +%% persistent state accessible to following testruns. The goal is to +%% make testruns completely independent of each other, of the order in +%% which they are executed, and of the testrun granularity, i.e. whether +%% they are executed individually or as part of a larger suite. This +%% should help to increase reproducibility and reduce the risk of false +%% positives. +%% +%% Isolation is achieved through the following measures: +%% * Each testrun completely terminates and unload all applications +%% started during the testrun. +%% * Each testrun is executed in a separate directory, usually under +%% common_test's private directory, where all persistent state should +%% be stored. +%% * Additionally, each cleans out few bits of persistent state that +%% survives the above measures, namely persistent VM terms related +%% to configuration and authentication (see `clean_suite_state/0`). +%% +%% Integration test in this context means a test that works with applications +%% as a whole, and needs to start and stop them as part of the test run. +%% For this, there's an abstraction called _appspec_ that describes how to +%% configure and start an application. 
+%% +%% The module also provides a set of default appspecs for some applications +%% that hide details and quirks of how to start them, to make it easier to +%% write test suites. +%% +%% Most of the time, you just need to: +%% 1. Describe the appspecs for the applications you want to test. +%% 2. Call `emqx_cth_sutie:start/2` to start the applications before the testrun +%% (e.g. in `init_per_suite/1` / `init_per_group/2`), providing the appspecs +%% and unique work dir for the testrun (e.g. `work_dir/1`). Save the result +%% in a context. +%% 3. Call `emqx_cth_sutie:stop/1` to stop the applications after the testrun +%% finishes (e.g. in `end_per_suite/1` / `end_per_group/2`), providing the +%% result from step 2. -module(emqx_cth_suite). -include_lib("common_test/include/ct.hrl"). From 0e770bdc95fc22e99227ebee777eb458a39a68db Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Tue, 29 Aug 2023 21:17:07 +0400 Subject: [PATCH 57/85] test: switch `emqx_broker_SUITE` to use new cth tooling --- apps/emqx/test/emqx_broker_SUITE.erl | 47 ++++++++++++++++++---------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/apps/emqx/test/emqx_broker_SUITE.erl b/apps/emqx/test/emqx_broker_SUITE.erl index 6e03971a5..52cf230ff 100644 --- a/apps/emqx/test/emqx_broker_SUITE.erl +++ b/apps/emqx/test/emqx_broker_SUITE.erl @@ -58,39 +58,54 @@ groups() -> init_per_group(connected_client_count_group, Config) -> Config; init_per_group(tcp, Config) -> - emqx_common_test_helpers:boot_modules(all), - emqx_common_test_helpers:start_apps([]), - [{conn_fun, connect} | Config]; + Apps = emqx_cth_suite:start( + [emqx], + #{work_dir => emqx_cth_suite:work_dir(Config)} + ), + [{conn_fun, connect}, {group_apps, Apps} | Config]; init_per_group(ws, Config) -> - emqx_common_test_helpers:boot_modules(all), - emqx_common_test_helpers:start_apps([]), + Apps = emqx_cth_suite:start( + [emqx], + #{work_dir => emqx_cth_suite:work_dir(Config)} + ), [ {ssl, false}, {enable_websocket, true}, 
{conn_fun, ws_connect}, {port, 8083}, - {host, "localhost"} + {host, "localhost"}, + {group_apps, Apps} | Config ]; init_per_group(quic, Config) -> - emqx_common_test_helpers:boot_modules(all), - emqx_common_test_helpers:start_apps([]), - UdpPort = 14567, - ok = emqx_common_test_helpers:ensure_quic_listener(?MODULE, UdpPort), + Apps = emqx_cth_suite:start( + [ + {emqx, + "listeners.quic.test {" + "\n enable = true" + "\n max_connections = 1024000" + "\n idle_timeout = 15s" + "\n }"} + ], + #{work_dir => emqx_cth_suite:work_dir(Config)} + ), [ {conn_fun, quic_connect}, - {port, UdpPort} + {port, emqx_config:get([listeners, quic, test, bind])}, + {group_apps, Apps} | Config ]; init_per_group(_Group, Config) -> - emqx_common_test_helpers:boot_modules(all), - emqx_common_test_helpers:start_apps([]), - Config. + Apps = emqx_cth_suite:start( + [emqx], + #{work_dir => emqx_cth_suite:work_dir(Config)} + ), + [{group_apps, Apps} | Config]. end_per_group(connected_client_count_group, _Config) -> ok; -end_per_group(_Group, _Config) -> - emqx_common_test_helpers:stop_apps([]). +end_per_group(_Group, Config) -> + emqx_cth_suite:stop(?config(group_apps, Config)). init_per_suite(Config) -> Config. 
From 54ac4a85277a4255c562785b6c7c58ad054bd435 Mon Sep 17 00:00:00 2001 From: Ilya Averyanov Date: Tue, 29 Aug 2023 23:21:36 +0300 Subject: [PATCH 58/85] chore(ft): tidy up the code according to the review --- apps/emqx/src/emqx_channel.erl | 2 +- apps/emqx_ft/src/emqx_ft_app.erl | 1 - apps/emqx_ft/src/emqx_ft_async_reply.erl | 4 ++-- apps/emqx_ft/src/emqx_ft_sup.erl | 2 ++ 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/apps/emqx/src/emqx_channel.erl b/apps/emqx/src/emqx_channel.erl index 53cac2400..d5941c8e9 100644 --- a/apps/emqx/src/emqx_channel.erl +++ b/apps/emqx/src/emqx_channel.erl @@ -1049,7 +1049,7 @@ handle_out(disconnect, {ReasonCode, ReasonName}, Channel) -> handle_out(disconnect, {ReasonCode, ReasonName, #{}}, Channel); handle_out(disconnect, {ReasonCode, ReasonName, Props}, Channel = ?IS_MQTT_V5) -> Packet = ?DISCONNECT_PACKET(ReasonCode, Props), - {ok, [?REPLY_OUTGOING(Packet), {close, ReasonName}], Channel}; + {ok, [?REPLY_OUTGOING(Packet), ?REPLY_CLOSE(ReasonName)], Channel}; handle_out(disconnect, {_ReasonCode, ReasonName, _Props}, Channel) -> {ok, {close, ReasonName}, Channel}; handle_out(auth, {ReasonCode, Properties}, Channel) -> diff --git a/apps/emqx_ft/src/emqx_ft_app.erl b/apps/emqx_ft/src/emqx_ft_app.erl index 114b4bff3..43a4cc816 100644 --- a/apps/emqx_ft/src/emqx_ft_app.erl +++ b/apps/emqx_ft/src/emqx_ft_app.erl @@ -22,7 +22,6 @@ start(_StartType, _StartArgs) -> {ok, Sup} = emqx_ft_sup:start_link(), - ok = emqx_ft_async_reply:create_tables(), ok = emqx_ft_conf:load(), {ok, Sup}. 
diff --git a/apps/emqx_ft/src/emqx_ft_async_reply.erl b/apps/emqx_ft/src/emqx_ft_async_reply.erl index 7ac4c527f..501f91629 100644 --- a/apps/emqx_ft/src/emqx_ft_async_reply.erl +++ b/apps/emqx_ft/src/emqx_ft_async_reply.erl @@ -62,8 +62,8 @@ create_tables() -> {read_concurrency, true}, {write_concurrency, true} ], - _ = ets:new(?MON_TAB, EtsOptions), - _ = ets:new(?PACKET_TAB, EtsOptions), + ok = emqx_utils_ets:new(?MON_TAB, EtsOptions), + ok = emqx_utils_ets:new(?PACKET_TAB, EtsOptions), ok. -spec register(packet_id(), mon_ref(), timer_ref()) -> ok. diff --git a/apps/emqx_ft/src/emqx_ft_sup.erl b/apps/emqx_ft/src/emqx_ft_sup.erl index 512d534c3..6d3936cf6 100644 --- a/apps/emqx_ft/src/emqx_ft_sup.erl +++ b/apps/emqx_ft/src/emqx_ft_sup.erl @@ -28,6 +28,8 @@ start_link() -> supervisor:start_link({local, ?SERVER}, ?MODULE, []). init([]) -> + ok = emqx_ft_async_reply:create_tables(), + SupFlags = #{ strategy => one_for_one, intensity => 100, From 804443ba40127d73ba4fa2e5e46515ee59f83107 Mon Sep 17 00:00:00 2001 From: firest Date: Wed, 30 Aug 2023 11:22:09 +0800 Subject: [PATCH 59/85] chore: fix typos --- apps/emqx/src/emqx_frame.erl | 2 +- apps/emqx/test/emqx_ws_connection_SUITE.erl | 2 +- changes/ce/perf-11532.en.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/emqx/src/emqx_frame.erl b/apps/emqx/src/emqx_frame.erl index 33b0ccbfa..20be12c42 100644 --- a/apps/emqx/src/emqx_frame.erl +++ b/apps/emqx/src/emqx_frame.erl @@ -473,7 +473,7 @@ parse_packet( {Properties, <<>>} = parse_properties(Rest, ?MQTT_PROTO_V5, StrictMode), #mqtt_packet_auth{reason_code = ReasonCode, properties = Properties}; parse_packet(Header, _FrameBin, _Options) -> - ?PARSE_ERR(#{hit => malformed_packet, header_type => Header#mqtt_packet_header.type}). + ?PARSE_ERR(#{hint => malformed_packet, header_type => Header#mqtt_packet_header.type}). 
parse_will_message( Packet = #mqtt_packet_connect{ diff --git a/apps/emqx/test/emqx_ws_connection_SUITE.erl b/apps/emqx/test/emqx_ws_connection_SUITE.erl index a6dcdb48d..37faef12b 100644 --- a/apps/emqx/test/emqx_ws_connection_SUITE.erl +++ b/apps/emqx/test/emqx_ws_connection_SUITE.erl @@ -546,7 +546,7 @@ t_parse_incoming_frame_error(_) -> {incoming, {frame_error, #{ header_type := _, - hit := malformed_packet + hint := malformed_packet }}} ], Packets diff --git a/changes/ce/perf-11532.en.md b/changes/ce/perf-11532.en.md index d1c8b9266..a522f7828 100644 --- a/changes/ce/perf-11532.en.md +++ b/changes/ce/perf-11532.en.md @@ -1 +1 @@ -Improve some error reasons for parsing with invalid packets. +Improve some error reasons when parsing invalid packets. From 5a693240da907170af56a7e4e3d7d62d09e30404 Mon Sep 17 00:00:00 2001 From: Kiril Velikov Date: Fri, 18 Aug 2023 20:12:43 +0000 Subject: [PATCH 60/85] feat(helm-chart): add support for loadBalancerClass Insert loadBalancerClass in the service manifest. Required for clusters with multiple loadbalancers. --- deploy/charts/emqx-enterprise/README.md | 1 + deploy/charts/emqx-enterprise/templates/service.yaml | 3 +++ deploy/charts/emqx-enterprise/values.yaml | 6 +++++- deploy/charts/emqx/README.md | 1 + deploy/charts/emqx/templates/service.yaml | 3 +++ deploy/charts/emqx/values.yaml | 6 +++++- 6 files changed, 18 insertions(+), 2 deletions(-) diff --git a/deploy/charts/emqx-enterprise/README.md b/deploy/charts/emqx-enterprise/README.md index b11159c84..d3977ceac 100644 --- a/deploy/charts/emqx-enterprise/README.md +++ b/deploy/charts/emqx-enterprise/README.md @@ -73,6 +73,7 @@ The following table lists the configurable parameters of the emqx chart and thei | `service.nodePorts.ws` | Kubernetes node port for WebSocket/HTTP. | nil | | `service.nodePorts.wss` | Kubernetes node port for WSS/HTTPS. | nil | | `service.nodePorts.dashboard` | Kubernetes node port for dashboard. 
| nil | +| `service.loadBalancerClass` | The load balancer implementation this Service belongs to | | | `service.loadBalancerIP` | loadBalancerIP for Service | nil | | `service.loadBalancerSourceRanges` | Address(es) that are allowed when service is LoadBalancer | [] | | `service.externalIPs` | ExternalIPs for the service | [] | diff --git a/deploy/charts/emqx-enterprise/templates/service.yaml b/deploy/charts/emqx-enterprise/templates/service.yaml index dea548653..525390a90 100644 --- a/deploy/charts/emqx-enterprise/templates/service.yaml +++ b/deploy/charts/emqx-enterprise/templates/service.yaml @@ -18,6 +18,9 @@ spec: externalTrafficPolicy: {{ .Values.service.externalTrafficPolicy | default "Cluster" }} {{- end }} {{- if eq .Values.service.type "LoadBalancer" }} + {{- if .Values.service.loadBalancerClass }} + loadBalancerClass: {{ .Values.service.loadBalancerClass }} + {{- end }} {{- if .Values.service.loadBalancerIP }} loadBalancerIP: {{ .Values.service.loadBalancerIP }} {{- end }} diff --git a/deploy/charts/emqx-enterprise/values.yaml b/deploy/charts/emqx-enterprise/values.yaml index 37fa56348..e830b81af 100644 --- a/deploy/charts/emqx-enterprise/values.yaml +++ b/deploy/charts/emqx-enterprise/values.yaml @@ -163,6 +163,10 @@ service: wss: dashboard: dashboardtls: + ## Specifies the load balancer implementation this Service belongs to. + ## Once set, it can not be changed. + ## + # loadBalancerClass: ## Set the LoadBalancer service type to internal only. 
## ref: https://kubernetes.io/docs/concepts/services-networking/service/#internal-load-balancer ## @@ -245,7 +249,7 @@ ssl: useExisting: false existingName: emqx-tls dnsnames: [] - commonName: + commonName: issuer: name: letsencrypt-dns kind: ClusterIssuer diff --git a/deploy/charts/emqx/README.md b/deploy/charts/emqx/README.md index 0221f5114..d9e144f0c 100644 --- a/deploy/charts/emqx/README.md +++ b/deploy/charts/emqx/README.md @@ -74,6 +74,7 @@ The following table lists the configurable parameters of the emqx chart and thei | `service.nodePorts.ws` | Kubernetes node port for WebSocket/HTTP. | nil | | `service.nodePorts.wss` | Kubernetes node port for WSS/HTTPS. | nil | | `service.nodePorts.dashboard` | Kubernetes node port for dashboard. | nil | +| `service.loadBalancerClass` | The load balancer implementation this Service belongs to | | | `service.loadBalancerIP` | loadBalancerIP for Service | nil | | `service.loadBalancerSourceRanges` | Address(es) that are allowed when service is LoadBalancer | [] | | `service.externalIPs` | ExternalIPs for the service | [] | diff --git a/deploy/charts/emqx/templates/service.yaml b/deploy/charts/emqx/templates/service.yaml index dea548653..525390a90 100644 --- a/deploy/charts/emqx/templates/service.yaml +++ b/deploy/charts/emqx/templates/service.yaml @@ -18,6 +18,9 @@ spec: externalTrafficPolicy: {{ .Values.service.externalTrafficPolicy | default "Cluster" }} {{- end }} {{- if eq .Values.service.type "LoadBalancer" }} + {{- if .Values.service.loadBalancerClass }} + loadBalancerClass: {{ .Values.service.loadBalancerClass }} + {{- end }} {{- if .Values.service.loadBalancerIP }} loadBalancerIP: {{ .Values.service.loadBalancerIP }} {{- end }} diff --git a/deploy/charts/emqx/values.yaml b/deploy/charts/emqx/values.yaml index 791db5812..88cc6279f 100644 --- a/deploy/charts/emqx/values.yaml +++ b/deploy/charts/emqx/values.yaml @@ -163,6 +163,10 @@ service: wss: dashboard: dashboardtls: + ## Specifies the load balancer implementation 
this Service belongs to. + ## Once set, it can not be changed. + ## + # loadBalancerClass: ## Set the LoadBalancer service type to internal only. ## ref: https://kubernetes.io/docs/concepts/services-networking/service/#internal-load-balancer ## @@ -245,7 +249,7 @@ ssl: useExisting: false existingName: emqx-tls dnsnames: [] - commonName: + commonName: issuer: name: letsencrypt-dns kind: ClusterIssuer From d911f7fbeaa03ba361554db2a9e2e85a9c7323ba Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Wed, 30 Aug 2023 11:12:33 +0200 Subject: [PATCH 61/85] fix: update gpb library to fix type error This fixes a bug in the protobuf schema registry functionality. Before this fix one would get a badarith error if one tried to assign a float value to an uint64 field. However, this commit fixes this by upgrading gpb so we instead will get a gpb_type_error which is what we want. Fixes: https://emqx.atlassian.net/browse/EMQX-10775 --- apps/emqx_schema_registry/rebar.config | 2 +- .../test/emqx_schema_registry_SUITE.erl | 15 ++++++++++++++- mix.exs | 2 +- rebar.config | 2 +- 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/apps/emqx_schema_registry/rebar.config b/apps/emqx_schema_registry/rebar.config index 9924100bc..d604b89d9 100644 --- a/apps/emqx_schema_registry/rebar.config +++ b/apps/emqx_schema_registry/rebar.config @@ -6,7 +6,7 @@ {emqx_utils, {path, "../emqx_utils"}}, {emqx_rule_engine, {path, "../emqx_rule_engine"}}, {erlavro, {git, "https://github.com/klarna/erlavro.git", {tag, "2.9.8"}}}, - {gpb, "4.19.7"} + {gpb, "4.19.9"} ]}. 
{shell, [ diff --git a/apps/emqx_schema_registry/test/emqx_schema_registry_SUITE.erl b/apps/emqx_schema_registry/test/emqx_schema_registry_SUITE.erl index 0db2ea3c6..e2a696428 100644 --- a/apps/emqx_schema_registry/test/emqx_schema_registry_SUITE.erl +++ b/apps/emqx_schema_registry/test/emqx_schema_registry_SUITE.erl @@ -42,7 +42,8 @@ sparkplug_tests() -> [ t_sparkplug_decode, t_sparkplug_encode, - t_sparkplug_decode_encode_with_message_name + t_sparkplug_decode_encode_with_message_name, + t_sparkplug_encode_float_to_uint64_key ]. init_per_suite(Config) -> @@ -847,6 +848,18 @@ t_sparkplug_encode(_Config) -> ?assertMatch(#{data := ExpectedRuleOutput}, Res), ok. +t_sparkplug_encode_float_to_uint64_key(_Config) -> + %% Test that the following bug is fixed: + %% https://emqx.atlassian.net/browse/EMQX-10775 + %% When one assign a float value to a uint64 key, one should get a + %% gpb_type_error and not a badarith error + wait_for_sparkplug_schema_registered(), + ?assertException( + error, + {gpb_type_error, _}, + emqx_rule_funcs:sparkplug_encode(#{<<"seq">> => 1.5}) + ). 
+ t_sparkplug_decode_encode_with_message_name(_Config) -> SQL = << diff --git a/mix.exs b/mix.exs index d57cedc2b..585abd230 100644 --- a/mix.exs +++ b/mix.exs @@ -93,7 +93,7 @@ defmodule EMQXUmbrella.MixProject do # in conflict by cowboy_swagger and cowboy {:ranch, github: "emqx/ranch", tag: "1.8.1-emqx", override: true}, # in conflict by grpc and eetcd - {:gpb, "4.19.7", override: true, runtime: false}, + {:gpb, "4.19.9", override: true, runtime: false}, {:hackney, github: "emqx/hackney", tag: "1.18.1-1", override: true}, # set by hackney (dependency) {:ssl_verify_fun, "1.1.6", override: true}, diff --git a/rebar.config b/rebar.config index 2d605c18f..197d06a67 100644 --- a/rebar.config +++ b/rebar.config @@ -53,7 +53,7 @@ [ {lc, {git, "https://github.com/emqx/lc.git", {tag, "0.3.2"}}} , {redbug, "2.0.8"} , {covertool, {git, "https://github.com/zmstone/covertool", {tag, "2.0.4.1"}}} - , {gpb, "4.19.7"} + , {gpb, "4.19.9"} , {typerefl, {git, "https://github.com/ieQu1/typerefl", {tag, "0.9.1"}}} , {gun, {git, "https://github.com/emqx/gun", {tag, "1.3.9"}}} , {ehttpc, {git, "https://github.com/emqx/ehttpc", {tag, "0.4.11"}}} From 36d44ca5b74c0266893e8f567a802039e91d0145 Mon Sep 17 00:00:00 2001 From: Kjell Winblad Date: Wed, 30 Aug 2023 11:29:29 +0200 Subject: [PATCH 62/85] docs: add changelog entry for gpb upgrade --- changes/ee/fix-11542.en.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/ee/fix-11542.en.md diff --git a/changes/ee/fix-11542.en.md b/changes/ee/fix-11542.en.md new file mode 100644 index 000000000..0100677bf --- /dev/null +++ b/changes/ee/fix-11542.en.md @@ -0,0 +1 @@ +Enhanced Google ProtoBuf schema registry support: Now, when assigning a float to an integer using the rule engine functions `schema_encode` or `sparkplug_encode`, a `gpb_type_error` will be raised instead of the previous `badarith` error. 
From 95a51658e1668444e7314ddea5e8cfdb4bb5a3dc Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Mon, 28 Aug 2023 15:28:56 -0300 Subject: [PATCH 63/85] ci: format integration test directory --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index fc9286837..435e4958a 100644 --- a/Makefile +++ b/Makefile @@ -296,7 +296,7 @@ $(foreach tt,$(ALL_ELIXIR_TGZS),$(eval $(call gen-elixir-tgz-target,$(tt)))) .PHONY: fmt fmt: $(REBAR) - @$(SCRIPTS)/erlfmt -w '{apps,lib-ee}/*/{src,include,priv,test}/**/*.{erl,hrl,app.src,eterm}' + @$(SCRIPTS)/erlfmt -w '{apps,lib-ee}/*/{src,include,priv,test,integration_test}/**/*.{erl,hrl,app.src,eterm}' @$(SCRIPTS)/erlfmt -w 'rebar.config.erl' @mix format From 9c6dd30f44a0d36ce12d4471bc3e6fb8e8cc4d70 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Wed, 30 Aug 2023 11:04:05 -0300 Subject: [PATCH 64/85] feat(session): store iterator ids in session record --- apps/emqx/include/emqx_session.hrl | 6 +++++- apps/emqx/integration_test/emqx_ds_SUITE.erl | 14 ++++++++++++-- apps/emqx/src/emqx_session.erl | 13 ++++++++++--- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/apps/emqx/include/emqx_session.hrl b/apps/emqx/include/emqx_session.hrl index 3fea157ed..304f92d58 100644 --- a/apps/emqx/include/emqx_session.hrl +++ b/apps/emqx/include/emqx_session.hrl @@ -49,7 +49,11 @@ %% Awaiting PUBREL Timeout (Unit: millisecond) await_rel_timeout :: timeout(), %% Created at - created_at :: pos_integer() + created_at :: pos_integer(), + %% Topic filter to iterator ID mapping. + %% Note: we shouldn't serialize this when persisting sessions, as this information + %% also exists in the `?ITERATOR_REF_TAB' table. + iterators = #{} :: #{emqx_topic:topic() => emqx_ds:iterator_id()} }). -endif. 
diff --git a/apps/emqx/integration_test/emqx_ds_SUITE.erl b/apps/emqx/integration_test/emqx_ds_SUITE.erl index 3f0d3f3e4..8fdbf7fb4 100644 --- a/apps/emqx/integration_test/emqx_ds_SUITE.erl +++ b/apps/emqx/integration_test/emqx_ds_SUITE.erl @@ -97,6 +97,12 @@ get_all_iterator_ids(Node) -> emqx_ds_storage_layer:foldl_iterator_prefix(?DS_SHARD, <<>>, Fn, []) end). +get_session_iterators(Node, ClientId) -> + erpc:call(Node, fun() -> + [ConnPid] = emqx_cm:lookup_channels(ClientId), + emqx_connection:info({channel, {session, iterators}}, sys:get_state(ConnPid)) + end). + wait_nodeup(Node) -> ?retry( _Sleep0 = 500, @@ -191,14 +197,18 @@ t_session_subscription_idempotency(Config) -> {ok, _} = emqtt:connect(Client1), ct:pal("subscribing 2"), {ok, _, [2]} = emqtt:subscribe(Client1, SubTopicFilter, qos2), + SessionIterators = get_session_iterators(Node1, ClientId), ok = emqtt:stop(Client1), - ok + #{session_iterators => SessionIterators} end, - fun(Trace) -> + fun(Res, Trace) -> ct:pal("trace:\n ~p", [Trace]), + #{session_iterators := SessionIterators} = Res, %% Exactly one iterator should have been opened. + ?assertEqual(1, map_size(SessionIterators), #{iterators => SessionIterators}), + ?assertMatch(#{SubTopicFilter := _}, SessionIterators), ?assertMatch({ok, [_]}, get_all_iterator_ids(Node1)), ?assertMatch( {_IsNew = false, ClientId}, diff --git a/apps/emqx/src/emqx_session.erl b/apps/emqx/src/emqx_session.erl index 0c051f002..8c1df9d06 100644 --- a/apps/emqx/src/emqx_session.erl +++ b/apps/emqx/src/emqx_session.erl @@ -269,7 +269,9 @@ info(awaiting_rel_max, #session{max_awaiting_rel = Max}) -> info(await_rel_timeout, #session{await_rel_timeout = Timeout}) -> Timeout; info(created_at, #session{created_at = CreatedAt}) -> - CreatedAt. + CreatedAt; +info(iterators, #session{iterators = Iterators}) -> + Iterators. %% @doc Get stats of the session. -spec stats(session()) -> emqx_types:stats(). 
@@ -318,8 +320,13 @@ is_subscriptions_full(#session{ -spec add_persistent_subscription(emqx_types:topic(), emqx_types:clientid(), session()) -> session(). add_persistent_subscription(TopicFilterBin, ClientId, Session) -> - _ = emqx_persistent_session_ds:add_subscription(TopicFilterBin, ClientId), - Session. + case emqx_persistent_session_ds:add_subscription(TopicFilterBin, ClientId) of + {ok, IteratorId, _IsNew} -> + Iterators = Session#session.iterators, + Session#session{iterators = Iterators#{TopicFilterBin => IteratorId}}; + _ -> + Session + end. %%-------------------------------------------------------------------- %% Client -> Broker: UNSUBSCRIBE From 922ca5e14111d7070fe65d27e13780ec69fea493 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Wed, 30 Aug 2023 11:13:19 -0300 Subject: [PATCH 65/85] feat(ds): close iterators when handling `UNSUBSCRIBE` packets Fixes https://emqx.atlassian.net/browse/EMQX-9742 --- apps/emqx/integration_test/emqx_ds_SUITE.erl | 126 +++++++++++++++++- apps/emqx/src/emqx_persistent_session_ds.erl | 72 +++++++++- apps/emqx/src/emqx_session.erl | 15 ++- .../emqx_persistent_session_ds_proto_v1.erl | 32 ++++- apps/emqx_durable_storage/src/emqx_ds.erl | 30 +++-- 5 files changed, 258 insertions(+), 17 deletions(-) diff --git a/apps/emqx/integration_test/emqx_ds_SUITE.erl b/apps/emqx/integration_test/emqx_ds_SUITE.erl index 8fdbf7fb4..cbfa5c185 100644 --- a/apps/emqx/integration_test/emqx_ds_SUITE.erl +++ b/apps/emqx/integration_test/emqx_ds_SUITE.erl @@ -9,8 +9,10 @@ -include_lib("stdlib/include/assert.hrl"). -include_lib("common_test/include/ct.hrl"). -include_lib("snabbkaffe/include/snabbkaffe.hrl"). +-include_lib("emqx/include/emqx_mqtt.hrl"). -define(DS_SHARD, <<"local">>). +-define(ITERATOR_REF_TAB, emqx_ds_iterator_ref). %%------------------------------------------------------------------------------ %% CT boilerplate @@ -31,9 +33,12 @@ end_per_suite(Config) -> emqx_cth_suite:stop(TCApps), ok. 
-init_per_testcase(t_session_subscription_idempotency = TC, Config) -> +init_per_testcase(TestCase, Config) when + TestCase =:= t_session_subscription_idempotency; + TestCase =:= t_session_unsubscription_idempotency +-> Cluster = cluster(#{n => 1}), - ClusterOpts = #{work_dir => emqx_cth_suite:work_dir(TC, Config)}, + ClusterOpts = #{work_dir => emqx_cth_suite:work_dir(TestCase, Config)}, NodeSpecs = emqx_cth_cluster:mk_nodespecs(Cluster, ClusterOpts), Nodes = emqx_cth_cluster:start(Cluster, ClusterOpts), [ @@ -46,7 +51,10 @@ init_per_testcase(t_session_subscription_idempotency = TC, Config) -> init_per_testcase(_TestCase, Config) -> Config. -end_per_testcase(t_session_subscription_idempotency, Config) -> +end_per_testcase(TestCase, Config) when + TestCase =:= t_session_subscription_idempotency; + TestCase =:= t_session_unsubscription_idempotency +-> Nodes = ?config(nodes, Config), ok = emqx_cth_cluster:stop(Nodes), ok; @@ -91,6 +99,9 @@ get_mqtt_port(Node, Type) -> {_IP, Port} = erpc:call(Node, emqx_config, get, [[listeners, Type, default, bind]]), Port. +get_all_iterator_refs(Node) -> + erpc:call(Node, mnesia, dirty_all_keys, [?ITERATOR_REF_TAB]). + get_all_iterator_ids(Node) -> Fn = fun(K, _V, Acc) -> [K | Acc] end, erpc:call(Node, fun() -> @@ -165,6 +176,7 @@ t_session_subscription_idempotency(Config) -> %% have to re-inject this so that we may stop the node succesfully at the %% end.... ok = emqx_cth_cluster:set_node_opts(Node1, Node1Spec), + ok = snabbkaffe:forward_trace(Node1), ct:pal("node ~p restarted", [Node1]), ?tp(restarted_node, #{}), ok @@ -209,6 +221,8 @@ t_session_subscription_idempotency(Config) -> %% Exactly one iterator should have been opened. 
?assertEqual(1, map_size(SessionIterators), #{iterators => SessionIterators}), ?assertMatch(#{SubTopicFilter := _}, SessionIterators), + SubTopicFilterWords = emqx_topic:words(SubTopicFilter), + ?assertEqual([{ClientId, SubTopicFilterWords}], get_all_iterator_refs(Node1)), ?assertMatch({ok, [_]}, get_all_iterator_ids(Node1)), ?assertMatch( {_IsNew = false, ClientId}, @@ -218,3 +232,109 @@ t_session_subscription_idempotency(Config) -> end ), ok. + +%% Check that we close the iterators before deleting the iterator id entry. +t_session_unsubscription_idempotency(Config) -> + [Node1Spec | _] = ?config(node_specs, Config), + [Node1] = ?config(nodes, Config), + Port = get_mqtt_port(Node1, tcp), + SubTopicFilter = <<"t/+">>, + ClientId = <<"myclientid">>, + ?check_trace( + begin + ?force_ordering( + #{?snk_kind := persistent_session_ds_close_iterators, ?snk_span := {complete, _}}, + _NEvents0 = 1, + #{?snk_kind := will_restart_node}, + _Guard0 = true + ), + ?force_ordering( + #{?snk_kind := restarted_node}, + _NEvents1 = 1, + #{?snk_kind := persistent_session_ds_iterator_delete, ?snk_span := start}, + _Guard1 = true + ), + + spawn_link(fun() -> + ?tp(will_restart_node, #{}), + ct:pal("restarting node ~p", [Node1]), + true = monitor_node(Node1, true), + ok = erpc:call(Node1, init, restart, []), + receive + {nodedown, Node1} -> + ok + after 10_000 -> + ct:fail("node ~p didn't stop", [Node1]) + end, + ct:pal("waiting for nodeup ~p", [Node1]), + wait_nodeup(Node1), + wait_gen_rpc_down(Node1Spec), + ct:pal("restarting apps on ~p", [Node1]), + Apps = maps:get(apps, Node1Spec), + ok = erpc:call(Node1, emqx_cth_suite, load_apps, [Apps]), + _ = erpc:call(Node1, emqx_cth_suite, start_apps, [Apps, Node1Spec]), + %% have to re-inject this so that we may stop the node succesfully at the + %% end.... 
+ ok = emqx_cth_cluster:set_node_opts(Node1, Node1Spec), + ok = snabbkaffe:forward_trace(Node1), + ct:pal("node ~p restarted", [Node1]), + ?tp(restarted_node, #{}), + ok + end), + + ct:pal("starting 1"), + {ok, Client0} = emqtt:start_link([ + {port, Port}, + {clientid, ClientId}, + {proto_ver, v5} + ]), + {ok, _} = emqtt:connect(Client0), + ct:pal("subscribing 1"), + {ok, _, [?RC_GRANTED_QOS_2]} = emqtt:subscribe(Client0, SubTopicFilter, qos2), + ct:pal("unsubscribing 1"), + process_flag(trap_exit, true), + catch emqtt:unsubscribe(Client0, SubTopicFilter), + receive + {'EXIT', {shutdown, _}} -> + ok + after 0 -> ok + end, + process_flag(trap_exit, false), + + {ok, _} = ?block_until(#{?snk_kind := restarted_node}, 15_000), + ct:pal("starting 2"), + {ok, Client1} = emqtt:start_link([ + {port, Port}, + {clientid, ClientId}, + {proto_ver, v5} + ]), + {ok, _} = emqtt:connect(Client1), + ct:pal("subscribing 2"), + {ok, _, [?RC_GRANTED_QOS_2]} = emqtt:subscribe(Client1, SubTopicFilter, qos2), + ct:pal("unsubscribing 2"), + {{ok, _, [?RC_SUCCESS]}, {ok, _}} = + ?wait_async_action( + emqtt:unsubscribe(Client1, SubTopicFilter), + #{ + ?snk_kind := persistent_session_ds_iterator_delete, + ?snk_span := {complete, _} + }, + 15_000 + ), + SessionIterators = get_session_iterators(Node1, ClientId), + + ok = emqtt:stop(Client1), + + #{session_iterators => SessionIterators} + end, + fun(Res, Trace) -> + ct:pal("trace:\n ~p", [Trace]), + #{session_iterators := SessionIterators} = Res, + %% No iterators remaining + ?assertEqual(#{}, SessionIterators), + ?assertEqual([], get_all_iterator_refs(Node1)), + ?assertEqual({ok, []}, get_all_iterator_ids(Node1)), + ok + end + ), + ok. 
diff --git a/apps/emqx/src/emqx_persistent_session_ds.erl b/apps/emqx/src/emqx_persistent_session_ds.erl index dc615fd5b..13c35ed9b 100644 --- a/apps/emqx/src/emqx_persistent_session_ds.erl +++ b/apps/emqx/src/emqx_persistent_session_ds.erl @@ -23,7 +23,8 @@ -export([ persist_message/1, open_session/1, - add_subscription/2 + add_subscription/2, + del_subscription/3 ]). -export([ @@ -32,7 +33,15 @@ ]). %% RPC --export([do_open_iterator/3]). +-export([ + ensure_iterator_closed_on_all_shards/1, + ensure_all_iterators_closed/1 +]). +-export([ + do_open_iterator/3, + do_ensure_iterator_closed/1, + do_ensure_all_iterators_closed/1 +]). %% FIXME -define(DS_SHARD, <<"local">>). @@ -130,6 +139,62 @@ do_open_iterator(TopicFilter, StartMS, IteratorID) -> {ok, _It} = emqx_ds_storage_layer:ensure_iterator(?DS_SHARD, IteratorID, Replay), ok. +-spec del_subscription(emqx_ds:iterator_id() | undefined, emqx_types:topic(), emqx_ds:session_id()) -> + ok | {skipped, disabled}. +del_subscription(IteratorID, TopicFilterBin, DSSessionID) -> + ?WHEN_ENABLED( + begin + TopicFilter = emqx_topic:words(TopicFilterBin), + Ctx = #{iterator_id => IteratorID}, + case IteratorID of + undefined -> + ok; + _ -> + ?tp_span( + persistent_session_ds_close_iterators, + Ctx, + ok = ensure_iterator_closed_on_all_shards(IteratorID) + ) + end, + ?tp_span( + persistent_session_ds_iterator_delete, + Ctx, + emqx_ds:session_del_iterator(DSSessionID, TopicFilter) + ) + end + ). + +-spec ensure_iterator_closed_on_all_shards(emqx_ds:iterator_id()) -> ok. +ensure_iterator_closed_on_all_shards(IteratorID) -> + %% Note: currently, shards map 1:1 to nodes, but this will change in the future. + Nodes = emqx:running_nodes(), + Results = emqx_persistent_session_ds_proto_v1:close_iterator(Nodes, IteratorID), + %% TODO: handle errors + true = lists:all(fun(Res) -> Res =:= {ok, ok} end, Results), + ok. + +%% RPC target. +-spec do_ensure_iterator_closed(emqx_ds:iterator_id()) -> ok. 
+do_ensure_iterator_closed(IteratorID) -> + ok = emqx_ds_storage_layer:discard_iterator(?DS_SHARD, IteratorID), + ok. + +-spec ensure_all_iterators_closed(emqx_ds:session_id()) -> ok. +ensure_all_iterators_closed(DSSessionID) -> + %% Note: currently, shards map 1:1 to nodes, but this will change in the future. + Nodes = emqx:running_nodes(), + Results = emqx_persistent_session_ds_proto_v1:close_all_iterators(Nodes, DSSessionID), + %% TODO: handle errors + true = lists:all(fun(Res) -> Res =:= {ok, ok} end, Results), + ok. + +%% RPC target. +-spec do_ensure_all_iterators_closed(emqx_ds:session_id()) -> ok. +do_ensure_all_iterators_closed(DSSessionID0) -> + DSSessionID = bin(DSSessionID0), + ok = emqx_ds_storage_layer:discard_iterator_prefix(?DS_SHARD, DSSessionID), + ok. + %% serialize_message(Msg) -> @@ -142,3 +207,6 @@ deserialize_message(Bin) -> is_store_enabled() -> emqx_config:get([persistent_session_store, ds]). + +bin(B) when is_binary(B) -> B; +bin(A) when is_atom(A) -> atom_to_binary(A, utf8). diff --git a/apps/emqx/src/emqx_session.erl b/apps/emqx/src/emqx_session.erl index 8c1df9d06..1ff4d9b85 100644 --- a/apps/emqx/src/emqx_session.erl +++ b/apps/emqx/src/emqx_session.erl @@ -335,23 +335,32 @@ add_persistent_subscription(TopicFilterBin, ClientId, Session) -> -spec unsubscribe(emqx_types:clientinfo(), emqx_types:topic(), emqx_types:subopts(), session()) -> {ok, session()} | {error, emqx_types:reason_code()}. 
unsubscribe( - ClientInfo, + ClientInfo = #{clientid := ClientId}, TopicFilter, UnSubOpts, - Session = #session{subscriptions = Subs} + Session0 = #session{subscriptions = Subs} ) -> case maps:find(TopicFilter, Subs) of {ok, SubOpts} -> ok = emqx_broker:unsubscribe(TopicFilter), + Session1 = remove_persistent_subscription(Session0, TopicFilter, ClientId), ok = emqx_hooks:run( 'session.unsubscribed', [ClientInfo, TopicFilter, maps:merge(SubOpts, UnSubOpts)] ), - {ok, Session#session{subscriptions = maps:remove(TopicFilter, Subs)}}; + {ok, Session1#session{subscriptions = maps:remove(TopicFilter, Subs)}}; error -> {error, ?RC_NO_SUBSCRIPTION_EXISTED} end. +-spec remove_persistent_subscription(session(), emqx_types:topic(), emqx_types:clientid()) -> + session(). +remove_persistent_subscription(Session, TopicFilterBin, ClientId) -> + Iterators = Session#session.iterators, + IteratorId = maps:get(TopicFilterBin, Iterators, undefined), + _ = emqx_persistent_session_ds:del_subscription(IteratorId, TopicFilterBin, ClientId), + Session#session{iterators = maps:remove(TopicFilterBin, Iterators)}. + %%-------------------------------------------------------------------- %% Client -> Broker: PUBLISH %%-------------------------------------------------------------------- diff --git a/apps/emqx/src/proto/emqx_persistent_session_ds_proto_v1.erl b/apps/emqx/src/proto/emqx_persistent_session_ds_proto_v1.erl index cd348cc2c..b1926098d 100644 --- a/apps/emqx/src/proto/emqx_persistent_session_ds_proto_v1.erl +++ b/apps/emqx/src/proto/emqx_persistent_session_ds_proto_v1.erl @@ -21,7 +21,9 @@ -export([ introduced_in/0, - open_iterator/4 + open_iterator/4, + close_iterator/2, + close_all_iterators/2 ]). -include_lib("emqx/include/bpapi.hrl"). @@ -47,3 +49,31 @@ open_iterator(Nodes, TopicFilter, StartMS, IteratorID) -> [TopicFilter, StartMS, IteratorID], ?TIMEOUT ). + +-spec close_iterator( + [node()], + emqx_ds:iterator_id() +) -> + emqx_rpc:erpc_multicall(ok). 
+close_iterator(Nodes, IteratorID) -> + erpc:multicall( + Nodes, + emqx_persistent_session_ds, + do_ensure_iterator_closed, + [IteratorID], + ?TIMEOUT + ). + +-spec close_all_iterators( + [node()], + emqx_ds:session_id() +) -> + emqx_rpc:erpc_multicall(ok). +close_all_iterators(Nodes, DSSessionID) -> + erpc:multicall( + Nodes, + emqx_persistent_session_ds, + do_ensure_all_iterators_closed, + [DSSessionID], + ?TIMEOUT + ). diff --git a/apps/emqx_durable_storage/src/emqx_ds.erl b/apps/emqx_durable_storage/src/emqx_ds.erl index 889e7ea24..095a93745 100644 --- a/apps/emqx_durable_storage/src/emqx_ds.erl +++ b/apps/emqx_durable_storage/src/emqx_ds.erl @@ -30,6 +30,7 @@ session_drop/1, session_suspend/1, session_add_iterator/2, + session_get_iterator_id/2, session_del_iterator/2, session_stats/0 ]). @@ -156,6 +157,7 @@ session_drop(ClientID) -> {atomic, ok} = mria:transaction( ?DS_SHARD, fun() -> + %% TODO: ensure all iterators from this clientid are closed? mnesia:delete({?SESSION_TAB, ClientID}) end ), @@ -201,14 +203,26 @@ session_add_iterator(DSSessionId, TopicFilter) -> end), Res. -%% @doc Called when a client unsubscribes from a topic. Returns `true' -%% if the session contained the subscription or `false' if it wasn't -%% subscribed. --spec session_del_iterator(session_id(), emqx_topic:words()) -> - {ok, boolean()} | {error, session_not_found}. -session_del_iterator(_SessionId, _TopicFilter) -> - %% TODO - {ok, false}. +-spec session_get_iterator_id(session_id(), emqx_topic:words()) -> + {ok, iterator_id()} | {error, not_found}. +session_get_iterator_id(DSSessionId, TopicFilter) -> + IteratorRefId = {DSSessionId, TopicFilter}, + case mnesia:dirty_read(?ITERATOR_REF_TAB, IteratorRefId) of + [] -> + {error, not_found}; + [#iterator_ref{it_id = IteratorId}] -> + {ok, IteratorId} + end. + +%% @doc Called when a client unsubscribes from a topic. +-spec session_del_iterator(session_id(), emqx_topic:words()) -> ok. 
+session_del_iterator(DSSessionId, TopicFilter) -> + IteratorRefId = {DSSessionId, TopicFilter}, + {atomic, ok} = + mria:transaction(?DS_SHARD, fun() -> + mnesia:delete(?ITERATOR_REF_TAB, IteratorRefId, write) + end), + ok. -spec session_stats() -> #{}. session_stats() -> From 2da604dfa4507a87155f07572f47ce0bc5922190 Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Wed, 30 Aug 2023 20:24:18 +0400 Subject: [PATCH 66/85] test(mgmt): simplify flaky `t_verify_imported_mnesia_tab_on_cluster` And also rest of the test setup in the suite. --- .../test/emqx_mgmt_data_backup_SUITE.erl | 32 +++++++------------ 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/apps/emqx_management/test/emqx_mgmt_data_backup_SUITE.erl b/apps/emqx_management/test/emqx_mgmt_data_backup_SUITE.erl index 381862995..04982a087 100644 --- a/apps/emqx_management/test/emqx_mgmt_data_backup_SUITE.erl +++ b/apps/emqx_management/test/emqx_mgmt_data_backup_SUITE.erl @@ -33,7 +33,7 @@ init_per_suite(Config) -> end_per_suite(_Config) -> ok. 
-init_per_testcase(TC = t_import_on_cluster, Config0) -> +init_per_testcase(TC = t_import_on_cluster, Config) -> %% Don't import listeners to avoid port conflicts %% when the same conf will be imported to another cluster meck:new(emqx_mgmt_listeners_conf, [passthrough]), @@ -50,17 +50,15 @@ init_per_testcase(TC = t_import_on_cluster, Config0) -> 1, {ok, #{changed => [], root_key => gateway}} ), - Config = [{tc_name, TC} | Config0], - [{cluster, cluster(Config)} | setup(Config)]; -init_per_testcase(TC = t_verify_imported_mnesia_tab_on_cluster, Config0) -> - Config = [{tc_name, TC} | Config0], - [{cluster, cluster(Config)} | setup(Config)]; + [{cluster, cluster(TC, Config)} | setup(TC, Config)]; +init_per_testcase(TC = t_verify_imported_mnesia_tab_on_cluster, Config) -> + [{cluster, cluster(TC, Config)} | setup(TC, Config)]; init_per_testcase(t_mnesia_bad_tab_schema, Config) -> meck:new(emqx_mgmt_data_backup, [passthrough]), meck:expect(TC = emqx_mgmt_data_backup, mnesia_tabs_to_backup, 0, [data_backup_test]), - setup([{tc_name, TC} | Config]); + setup(TC, Config); init_per_testcase(TC, Config) -> - setup([{tc_name, TC} | Config]). + setup(TC, Config). end_per_testcase(t_import_on_cluster, Config) -> emqx_cth_cluster:stop(?config(cluster, Config)), @@ -304,7 +302,7 @@ t_verify_imported_mnesia_tab_on_cluster(Config) -> {ok, Cwd} = file:get_cwd(), AbsFilePath = filename:join(Cwd, FileName), - [CoreNode1, CoreNode2, ReplicantNode] = NodesList = ?config(cluster, Config), + [CoreNode1, CoreNode2, ReplicantNode] = ?config(cluster, Config), [ {ok, _} = rpc:call(CoreNode1, emqx_dashboard_admin, add_user, [U, U, U]) @@ -328,10 +326,7 @@ t_verify_imported_mnesia_tab_on_cluster(Config) -> %% Give some extra time to replicant to import data... 
timer:sleep(3000), - ?assertEqual(AllUsers, lists:sort(rpc:call(ReplicantNode, mnesia, dirty_all_keys, [Tab]))), - - [rpc:call(N, ekka, leave, []) || N <- lists:reverse(NodesList)], - [emqx_common_test_helpers:stop_slave(N) || N <- NodesList]. + ?assertEqual(AllUsers, lists:sort(rpc:call(ReplicantNode, mnesia, dirty_all_keys, [Tab]))). t_mnesia_bad_tab_schema(_Config) -> OldAttributes = [id, name, description], @@ -386,8 +381,8 @@ t_read_files(_Config) -> %% Internal test helpers %%------------------------------------------------------------------------------ -setup(Config) -> - WorkDir = filename:join(work_dir(Config), local), +setup(TC, Config) -> + WorkDir = filename:join(emqx_cth_suite:work_dir(TC, Config), local), Started = emqx_cth_suite:start(apps_to_start(), #{work_dir => WorkDir}), [{suite_apps, Started} | Config]. @@ -408,20 +403,17 @@ recompose_version(MajorInt, MinorInt, Patch) -> [integer_to_list(MajorInt + 1), $., integer_to_list(MinorInt), $. | Patch] ). -cluster(Config) -> +cluster(TC, Config) -> Nodes = emqx_cth_cluster:start( [ {data_backup_core1, #{role => core, apps => apps_to_start()}}, {data_backup_core2, #{role => core, apps => apps_to_start()}}, {data_backup_replicant, #{role => replicant, apps => apps_to_start()}} ], - #{work_dir => work_dir(Config)} + #{work_dir => emqx_cth_suite:work_dir(TC, Config)} ), Nodes. -work_dir(Config) -> - filename:join(?config(priv_dir, Config), ?config(tc_name, Config)). 
- create_test_tab(Attributes) -> ok = mria:create_table(data_backup_test, [ {type, set}, From 254748b54e1f0c80b2e98c32691824e58421f744 Mon Sep 17 00:00:00 2001 From: firest Date: Fri, 1 Sep 2023 13:45:40 +0800 Subject: [PATCH 67/85] chore: comment mountpoints to support direct copy-paste used --- rel/config/examples/listeners.quic.conf.example | 2 +- rel/config/examples/listeners.tcp.conf.example | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rel/config/examples/listeners.quic.conf.example b/rel/config/examples/listeners.quic.conf.example index 52161e828..9ec2646a5 100644 --- a/rel/config/examples/listeners.quic.conf.example +++ b/rel/config/examples/listeners.quic.conf.example @@ -5,7 +5,7 @@ listeners.quic.my_quick_listener_name { bind = 14567 ## or with an IP, e.g. "127.0.0.1:14567" ## When publishing or subscribing, prefix all topics with a mountpoint string - mountpoint = "${clientid}/msg" + ## mountpoint = "${clientid}/msg" ## Client authentication ## Type: diff --git a/rel/config/examples/listeners.tcp.conf.example b/rel/config/examples/listeners.tcp.conf.example index f03d98cc2..7f4dcdfd7 100644 --- a/rel/config/examples/listeners.tcp.conf.example +++ b/rel/config/examples/listeners.tcp.conf.example @@ -11,7 +11,7 @@ listeners.tcp.my_tcp_listener_name { proxy_protocol_timeout = 8 ## When publishing or subscribing, prefix all topics with a mountpoint string - mountpoint = "mqtt" ## Do not set this unless you know what is it for + ## mountpoint = "mqtt" ## Do not set this unless you know what is it for ## Client authentication ## Type: From 53940754b7472e797097b9066a154b278fe570dc Mon Sep 17 00:00:00 2001 From: firest Date: Fri, 1 Sep 2023 14:17:33 +0800 Subject: [PATCH 68/85] chore: shutdown reason will never be a UTF8 atom --- apps/emqx_exhook/test/props/prop_exhook_hooks.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/emqx_exhook/test/props/prop_exhook_hooks.erl 
b/apps/emqx_exhook/test/props/prop_exhook_hooks.erl index 34d7a4342..2c9b5bb06 100644 --- a/apps/emqx_exhook/test/props/prop_exhook_hooks.erl +++ b/apps/emqx_exhook/test/props/prop_exhook_hooks.erl @@ -642,7 +642,7 @@ unsub_properties() -> #{}. shutdown_reason() -> - oneof([utf8(), {shutdown, emqx_proper_types:limited_atom()}]). + oneof([utf8(), {shutdown, emqx_proper_types:limited_latin_atom()}]). authresult() -> ?LET( From d4aba52e1683b72fee80ccaac8862ef718eaa7db Mon Sep 17 00:00:00 2001 From: firest Date: Fri, 1 Sep 2023 18:03:45 +0800 Subject: [PATCH 69/85] chore(dev): support the EMQX_NODE__NAME variable --- dev | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dev b/dev index 20cd66569..7768fbcf6 100755 --- a/dev +++ b/dev @@ -43,7 +43,7 @@ OPTIONS: -c|--compile: Force recompile, otherwise starts with the already built libs in '_build/\$PROFILE/lib/'. -e|--ekka-epmd: Force to use ekka_epmd. - -n|--name: Node name, defaults to \$EMQX_NODE_NAME env. + -n|--name: Node name, defaults to \$EMQX_NODE__NAME or the \$EMQX_NODE_NAME env. ENVIRONMENT VARIABLES: @@ -63,7 +63,7 @@ export HOCON_ENV_OVERRIDE_PREFIX='EMQX_' export EMQX_LOG__FILE__DEFAULT__ENABLE='false' export EMQX_LOG__CONSOLE__ENABLE='true' SYSTEM="$(./scripts/get-distro.sh)" -EMQX_NODE_NAME="${EMQX_NODE_NAME:-emqx@127.0.0.1}" +EMQX_NODE_NAME="${EMQX_NODE__NAME:-${EMQX_NODE_NAME:-emqx@127.0.0.1}}" PROFILE="${PROFILE:-emqx}" FORCE_COMPILE=0 # Do not start using ekka epmd by default, so your IDE can connect to it @@ -158,7 +158,7 @@ export EMQX_LOG_DIR="$BASE_DIR/log" export EMQX_PLUGINS__INSTALL_DIR="${EMQX_PLUGINS__INSTALL_DIR:-$BASE_DIR/plugins}" CONFIGS_DIR="$EMQX_DATA_DIR/configs" # Use your cookie so your IDE can connect to it. 
-COOKIE="${EMQX_NODE__COOKIE:-${EMQX_NODE_COOKIE:-$(cat ~/.erlang.cookie || echo 'emqxsecretcookie')}}" +COOKIE="${EMQX_NODE__COOKIE:-${EMQX_NODE_COOKIE:-$(cat ~/.erlang.cookie 2>/dev/null || echo 'emqxsecretcookie')}}" mkdir -p "$EMQX_ETC_DIR" "$EMQX_DATA_DIR/patches" "$EMQX_DATA_DIR/plugins" "$EMQX_DATA_DIR/certs" "$EMQX_LOG_DIR" "$CONFIGS_DIR" if [ $EKKA_EPMD -eq 1 ]; then EPMD_ARGS='-start_epmd false -epmd_module ekka_epmd' From 1bf86250dd80763532e9a67f55e1058465ce0f7e Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Wed, 30 Aug 2023 20:56:28 +0300 Subject: [PATCH 70/85] fix(emqx_bridge_cassandra): allow cassandra bridge without username/password Cassandra can be used without credentials, if it is configured with AllowAllAuthenticator (default). --- .../src/emqx_bridge_cassandra.app.src | 2 +- .../src/emqx_bridge_cassandra_connector.erl | 23 +++++++++++++++---- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/apps/emqx_bridge_cassandra/src/emqx_bridge_cassandra.app.src b/apps/emqx_bridge_cassandra/src/emqx_bridge_cassandra.app.src index de790ab46..26028e8ab 100644 --- a/apps/emqx_bridge_cassandra/src/emqx_bridge_cassandra.app.src +++ b/apps/emqx_bridge_cassandra/src/emqx_bridge_cassandra.app.src @@ -1,6 +1,6 @@ {application, emqx_bridge_cassandra, [ {description, "EMQX Enterprise Cassandra Bridge"}, - {vsn, "0.1.3"}, + {vsn, "0.1.4"}, {registered, []}, {applications, [ kernel, diff --git a/apps/emqx_bridge_cassandra/src/emqx_bridge_cassandra_connector.erl b/apps/emqx_bridge_cassandra/src/emqx_bridge_cassandra_connector.erl index 2cbf0d6fe..0610ee743 100644 --- a/apps/emqx_bridge_cassandra/src/emqx_bridge_cassandra_connector.erl +++ b/apps/emqx_bridge_cassandra/src/emqx_bridge_cassandra_connector.erl @@ -94,7 +94,6 @@ on_start( #{ servers := Servers0, keyspace := Keyspace, - username := Username, pool_size := PoolSize, ssl := SSL } = Config @@ -114,12 +113,12 @@ on_start( Options = [ {nodes, Servers}, - {username, Username}, - {password, 
emqx_secret:wrap(maps:get(password, Config, ""))}, {keyspace, Keyspace}, {auto_reconnect, ?AUTO_RECONNECT_INTERVAL}, {pool_size, PoolSize} ], + Options1 = maybe_add_opt(username, Config, Options), + Options2 = maybe_add_opt(password, Config, Options1, _IsSensitive = true), SslOpts = case maps:get(enable, SSL) of @@ -132,7 +131,7 @@ on_start( [] end, State = parse_prepare_cql(Config), - case emqx_resource_pool:start(InstId, ?MODULE, Options ++ SslOpts) of + case emqx_resource_pool:start(InstId, ?MODULE, Options2 ++ SslOpts) of ok -> {ok, init_prepare(State#{pool_name => InstId, prepare_statement => #{}})}; {error, Reason} -> @@ -513,3 +512,19 @@ maybe_assign_type(V) when is_integer(V) -> maybe_assign_type(V) when is_float(V) -> {double, V}; maybe_assign_type(V) -> V. + +maybe_add_opt(Key, Conf, Opts) -> + maybe_add_opt(Key, Conf, Opts, _IsSensitive = false). + +maybe_add_opt(Key, Conf, Opts, IsSensitive) -> + case Conf of + #{Key := Val} -> + [{Key, maybe_wrap(IsSensitive, Val)} | Opts]; + _ -> + Opts + end. + +maybe_wrap(false = _IsSensitive, Val) -> + Val; +maybe_wrap(true, Val) -> + emqx_secret:wrap(Val). 
From c54527857a076342f3fb45d974e7561d30c4ef8c Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Thu, 31 Aug 2023 16:32:19 +0300 Subject: [PATCH 71/85] fix(emqx_bridge_sqlserver): use empty password by default as it is not a required field --- apps/emqx_bridge_sqlserver/src/emqx_bridge_sqlserver.app.src | 2 +- .../src/emqx_bridge_sqlserver_connector.erl | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/apps/emqx_bridge_sqlserver/src/emqx_bridge_sqlserver.app.src b/apps/emqx_bridge_sqlserver/src/emqx_bridge_sqlserver.app.src index 3aa8b3b68..530578d73 100644 --- a/apps/emqx_bridge_sqlserver/src/emqx_bridge_sqlserver.app.src +++ b/apps/emqx_bridge_sqlserver/src/emqx_bridge_sqlserver.app.src @@ -1,6 +1,6 @@ {application, emqx_bridge_sqlserver, [ {description, "EMQX Enterprise SQL Server Bridge"}, - {vsn, "0.1.2"}, + {vsn, "0.1.3"}, {registered, []}, {applications, [kernel, stdlib, emqx_resource, odbc]}, {env, []}, diff --git a/apps/emqx_bridge_sqlserver/src/emqx_bridge_sqlserver_connector.erl b/apps/emqx_bridge_sqlserver/src/emqx_bridge_sqlserver_connector.erl index b6ad15ab9..3e1abbeba 100644 --- a/apps/emqx_bridge_sqlserver/src/emqx_bridge_sqlserver_connector.erl +++ b/apps/emqx_bridge_sqlserver/src/emqx_bridge_sqlserver_connector.erl @@ -173,7 +173,6 @@ on_start( #{ server := Server, username := Username, - password := Password, driver := Driver, database := Database, pool_size := PoolSize, @@ -200,7 +199,7 @@ on_start( Options = [ {server, to_bin(Server)}, {username, Username}, - {password, Password}, + {password, emqx_secret:wrap(maps:get(password, Config, ""))}, {driver, Driver}, {database, Database}, {pool_size, PoolSize} @@ -320,7 +319,7 @@ conn_str([{database, Database} | Opts], Acc) -> conn_str([{username, Username} | Opts], Acc) -> conn_str(Opts, ["UID=" ++ str(Username) | Acc]); conn_str([{password, Password} | Opts], Acc) -> - conn_str(Opts, ["PWD=" ++ str(Password) | Acc]); + conn_str(Opts, ["PWD=" ++ 
str(emqx_secret:unwrap(Password)) | Acc]); conn_str([{_, _} | Opts], Acc) -> conn_str(Opts, Acc). From 90156befb5f503817fe9ed68ea59d5c1e1e9b7cc Mon Sep 17 00:00:00 2001 From: Ilya Averyanov Date: Mon, 21 Aug 2023 18:39:43 +0300 Subject: [PATCH 72/85] feat: improve bcrypt usability * limit salt rounds to usable values * update bcrypt library to enable concurrent bcrypt hash calculation --- .../src/emqx_authn_password_hashing.erl | 19 +++++++++++++++---- changes/ce/feat-11487.en.md | 2 ++ mix.exs | 2 +- rebar.config.erl | 2 +- 4 files changed, 19 insertions(+), 6 deletions(-) create mode 100644 changes/ce/feat-11487.en.md diff --git a/apps/emqx_authn/src/emqx_authn_password_hashing.erl b/apps/emqx_authn/src/emqx_authn_password_hashing.erl index 4954cd66e..66bc6bfc6 100644 --- a/apps/emqx_authn/src/emqx_authn_password_hashing.erl +++ b/apps/emqx_authn/src/emqx_authn_password_hashing.erl @@ -63,6 +63,9 @@ check_password/4 ]). +-define(SALT_ROUNDS_MIN, 5). +-define(SALT_ROUNDS_MAX, 10). + namespace() -> "authn-hash". roots() -> [pbkdf2, bcrypt, bcrypt_rw, simple]. @@ -71,11 +74,12 @@ fields(bcrypt_rw) -> [ {salt_rounds, sc( - integer(), + range(?SALT_ROUNDS_MIN, ?SALT_ROUNDS_MAX), #{ - default => 10, - example => 10, - desc => "Salt rounds for BCRYPT password generation." + default => ?SALT_ROUNDS_MAX, + example => ?SALT_ROUNDS_MAX, + desc => "Work factor for BCRYPT password generation.", + converter => fun salt_rounds_converter/2 } )} ]; @@ -106,6 +110,13 @@ fields(simple) -> {salt_position, fun salt_position/1} ]. +salt_rounds_converter(undefined, _) -> + undefined; +salt_rounds_converter(I, _) when is_integer(I) -> + emqx_utils:clamp(I, ?SALT_ROUNDS_MIN, ?SALT_ROUNDS_MAX); +salt_rounds_converter(X, _) -> + X. 
+ desc(bcrypt_rw) -> "Settings for bcrypt password hashing algorithm (for DB backends with write capability)."; desc(bcrypt) -> diff --git a/changes/ce/feat-11487.en.md b/changes/ce/feat-11487.en.md new file mode 100644 index 000000000..352a11c06 --- /dev/null +++ b/changes/ce/feat-11487.en.md @@ -0,0 +1,2 @@ +The bcrypt work factor is limited to the range 5-10, because higher values consume too much CPU resources. +Bcrypt library is updated to allow parallel hash evaluation. diff --git a/mix.exs b/mix.exs index d57cedc2b..8eb83b291 100644 --- a/mix.exs +++ b/mix.exs @@ -815,7 +815,7 @@ defmodule EMQXUmbrella.MixProject do defp bcrypt_dep() do if enable_bcrypt?(), - do: [{:bcrypt, github: "emqx/erlang-bcrypt", tag: "0.6.0", override: true}], + do: [{:bcrypt, github: "emqx/erlang-bcrypt", tag: "0.6.1", override: true}], else: [] end diff --git a/rebar.config.erl b/rebar.config.erl index ad6f425a0..6e1c64a40 100644 --- a/rebar.config.erl +++ b/rebar.config.erl @@ -36,7 +36,7 @@ assert_otp() -> end. bcrypt() -> - {bcrypt, {git, "https://github.com/emqx/erlang-bcrypt.git", {tag, "0.6.0"}}}. + {bcrypt, {git, "https://github.com/emqx/erlang-bcrypt.git", {tag, "0.6.1"}}}. quicer() -> {quicer, {git, "https://github.com/emqx/quic.git", {tag, "0.0.114"}}}. 
From c3b346017373648deb8f03f1c7cb84fd5593c092 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Thu, 31 Aug 2023 16:57:44 +0300 Subject: [PATCH 73/85] fix(emqx_oracle): make username a required field in emqx_oracle_schema --- apps/emqx_oracle/src/emqx_oracle.app.src | 2 +- apps/emqx_oracle/src/emqx_oracle_schema.erl | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/apps/emqx_oracle/src/emqx_oracle.app.src b/apps/emqx_oracle/src/emqx_oracle.app.src index e2d6d856f..c30e6be95 100644 --- a/apps/emqx_oracle/src/emqx_oracle.app.src +++ b/apps/emqx_oracle/src/emqx_oracle.app.src @@ -1,6 +1,6 @@ {application, emqx_oracle, [ {description, "EMQX Enterprise Oracle Database Connector"}, - {vsn, "0.1.5"}, + {vsn, "0.1.6"}, {registered, []}, {applications, [ kernel, diff --git a/apps/emqx_oracle/src/emqx_oracle_schema.erl b/apps/emqx_oracle/src/emqx_oracle_schema.erl index 3adde5af3..ba9904f19 100644 --- a/apps/emqx_oracle/src/emqx_oracle_schema.erl +++ b/apps/emqx_oracle/src/emqx_oracle_schema.erl @@ -21,7 +21,7 @@ roots() -> fields(config) -> Fields = [{server, server()}, {sid, fun sid/1}, {service_name, fun service_name/1}] ++ - emqx_connector_schema_lib:relational_db_fields() ++ + adjust_fields(emqx_connector_schema_lib:relational_db_fields()) ++ emqx_connector_schema_lib:prepare_statement_fields(), proplists:delete(database, Fields). @@ -38,3 +38,16 @@ service_name(type) -> binary(); service_name(desc) -> ?DESC(?REF_MODULE, "service_name"); service_name(required) -> false; service_name(_) -> undefined. + +adjust_fields(Fields) -> + lists:map( + fun + ({username, Sc}) -> + %% to please dialyzer... + Override = #{type => hocon_schema:field_schema(Sc, type), required => true}, + {username, hocon_schema:override(Sc, Override)}; + (Field) -> + Field + end, + Fields + ). 
From f0c75d97e13f7124bd76844bee099d433062a4c2 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Fri, 1 Sep 2023 13:21:37 +0300 Subject: [PATCH 74/85] chore: upgrade emqx_http_lib to 0.5.3 0.5.3 release includes a fix to parse : URLs using the default http scheme. --- apps/emqx/rebar.config | 2 +- mix.exs | 2 +- rebar.config | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/emqx/rebar.config b/apps/emqx/rebar.config index 730155805..eb0465db0 100644 --- a/apps/emqx/rebar.config +++ b/apps/emqx/rebar.config @@ -31,7 +31,7 @@ {ekka, {git, "https://github.com/emqx/ekka", {tag, "0.15.10"}}}, {gen_rpc, {git, "https://github.com/emqx/gen_rpc", {tag, "2.8.1"}}}, {hocon, {git, "https://github.com/emqx/hocon.git", {tag, "0.39.16"}}}, - {emqx_http_lib, {git, "https://github.com/emqx/emqx_http_lib.git", {tag, "0.5.2"}}}, + {emqx_http_lib, {git, "https://github.com/emqx/emqx_http_lib.git", {tag, "0.5.3"}}}, {pbkdf2, {git, "https://github.com/emqx/erlang-pbkdf2.git", {tag, "2.0.4"}}}, {recon, {git, "https://github.com/ferd/recon", {tag, "2.5.1"}}}, {snabbkaffe, {git, "https://github.com/kafka4beam/snabbkaffe.git", {tag, "1.0.8"}}} diff --git a/mix.exs b/mix.exs index 585abd230..bc8d93f4b 100644 --- a/mix.exs +++ b/mix.exs @@ -73,7 +73,7 @@ defmodule EMQXUmbrella.MixProject do {:getopt, "1.0.2", override: true}, {:snabbkaffe, github: "kafka4beam/snabbkaffe", tag: "1.0.8", override: true}, {:hocon, github: "emqx/hocon", tag: "0.39.16", override: true}, - {:emqx_http_lib, github: "emqx/emqx_http_lib", tag: "0.5.2", override: true}, + {:emqx_http_lib, github: "emqx/emqx_http_lib", tag: "0.5.3", override: true}, {:esasl, github: "emqx/esasl", tag: "0.2.0"}, {:jose, github: "potatosalad/erlang-jose", tag: "1.11.2"}, # in conflict by ehttpc and emqtt diff --git a/rebar.config b/rebar.config index 197d06a67..dae8319e0 100644 --- a/rebar.config +++ b/rebar.config @@ -76,7 +76,7 @@ , {getopt, "1.0.2"} , {snabbkaffe, {git, 
"https://github.com/kafka4beam/snabbkaffe.git", {tag, "1.0.8"}}} , {hocon, {git, "https://github.com/emqx/hocon.git", {tag, "0.39.16"}}} - , {emqx_http_lib, {git, "https://github.com/emqx/emqx_http_lib.git", {tag, "0.5.2"}}} + , {emqx_http_lib, {git, "https://github.com/emqx/emqx_http_lib.git", {tag, "0.5.3"}}} , {esasl, {git, "https://github.com/emqx/esasl", {tag, "0.2.0"}}} , {jose, {git, "https://github.com/potatosalad/erlang-jose", {tag, "1.11.2"}}} , {telemetry, "1.1.0"} From 607705518b5449cb6ac408ab58e71d3bbbc9279f Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Fri, 1 Sep 2023 16:55:53 +0300 Subject: [PATCH 75/85] test(emqx_bridge_cassandra): add connector test case for Cassandra configured without authentication --- .ci/docker-compose-file/cassandra/Dockerfile | 4 - .../cassandra/cassandra_noauth.yaml | 1236 +++++++++++++++++ .../docker-compose-cassandra.yaml | 56 +- .../emqx_bridge_cassandra_connector_SUITE.erl | 71 +- 4 files changed, 1314 insertions(+), 53 deletions(-) delete mode 100644 .ci/docker-compose-file/cassandra/Dockerfile create mode 100644 .ci/docker-compose-file/cassandra/cassandra_noauth.yaml diff --git a/.ci/docker-compose-file/cassandra/Dockerfile b/.ci/docker-compose-file/cassandra/Dockerfile deleted file mode 100644 index f974c1b6f..000000000 --- a/.ci/docker-compose-file/cassandra/Dockerfile +++ /dev/null @@ -1,4 +0,0 @@ -ARG CASSANDRA_TAG=3.11.6 -FROM cassandra:${CASSANDRA_TAG} -COPY cassandra.yaml /etc/cassandra/cassandra.yaml -CMD ["cassandra", "-f"] diff --git a/.ci/docker-compose-file/cassandra/cassandra_noauth.yaml b/.ci/docker-compose-file/cassandra/cassandra_noauth.yaml new file mode 100644 index 000000000..eff87061d --- /dev/null +++ b/.ci/docker-compose-file/cassandra/cassandra_noauth.yaml @@ -0,0 +1,1236 @@ +# Cassandra storage config YAML + +# NOTE: +# See http://wiki.apache.org/cassandra/StorageConfiguration for +# full explanations of configuration directives +# /NOTE + +# The name of the cluster. 
This is mainly used to prevent machines in +# one logical cluster from joining another. +cluster_name: 'Test Cluster' + +# This defines the number of tokens randomly assigned to this node on the ring +# The more tokens, relative to other nodes, the larger the proportion of data +# that this node will store. You probably want all nodes to have the same number +# of tokens assuming they have equal hardware capability. +# +# If you leave this unspecified, Cassandra will use the default of 1 token for legacy compatibility, +# and will use the initial_token as described below. +# +# Specifying initial_token will override this setting on the node's initial start, +# on subsequent starts, this setting will apply even if initial token is set. +# +# If you already have a cluster with 1 token per node, and wish to migrate to +# multiple tokens per node, see http://wiki.apache.org/cassandra/Operations +num_tokens: 256 + +# Triggers automatic allocation of num_tokens tokens for this node. The allocation +# algorithm attempts to choose tokens in a way that optimizes replicated load over +# the nodes in the datacenter for the replication strategy used by the specified +# keyspace. +# +# The load assigned to each node will be close to proportional to its number of +# vnodes. +# +# Only supported with the Murmur3Partitioner. +# allocate_tokens_for_keyspace: KEYSPACE + +# initial_token allows you to specify tokens manually. While you can use it with +# vnodes (num_tokens > 1, above) -- in which case you should provide a +# comma-separated list -- it's primarily used when adding nodes to legacy clusters +# that do not have vnodes enabled. 
+# initial_token: + +# See http://wiki.apache.org/cassandra/HintedHandoff +# May either be "true" or "false" to enable globally +hinted_handoff_enabled: true + +# When hinted_handoff_enabled is true, a black list of data centers that will not +# perform hinted handoff +# hinted_handoff_disabled_datacenters: +# - DC1 +# - DC2 + +# this defines the maximum amount of time a dead host will have hints +# generated. After it has been dead this long, new hints for it will not be +# created until it has been seen alive and gone down again. +max_hint_window_in_ms: 10800000 # 3 hours + +# Maximum throttle in KBs per second, per delivery thread. This will be +# reduced proportionally to the number of nodes in the cluster. (If there +# are two nodes in the cluster, each delivery thread will use the maximum +# rate; if there are three, each will throttle to half of the maximum, +# since we expect two nodes to be delivering hints simultaneously.) +hinted_handoff_throttle_in_kb: 1024 + +# Number of threads with which to deliver hints; +# Consider increasing this number when you have multi-dc deployments, since +# cross-dc handoff tends to be slower +max_hints_delivery_threads: 2 + +# Directory where Cassandra should store hints. +# If not set, the default directory is $CASSANDRA_HOME/data/hints. +# hints_directory: /var/lib/cassandra/hints + +# How often hints should be flushed from the internal buffers to disk. +# Will *not* trigger fsync. +hints_flush_period_in_ms: 10000 + +# Maximum size for a single hints file, in megabytes. +max_hints_file_size_in_mb: 128 + +# Compression to apply to the hint files. If omitted, hints files +# will be written uncompressed. LZ4, Snappy, and Deflate compressors +# are supported. +#hints_compression: +# - class_name: LZ4Compressor +# parameters: +# - + +# Maximum throttle in KBs per second, total. This will be +# reduced proportionally to the number of nodes in the cluster. 
+batchlog_replay_throttle_in_kb: 1024 + +# Authentication backend, implementing IAuthenticator; used to identify users +# Out of the box, Cassandra provides org.apache.cassandra.auth.{AllowAllAuthenticator, +# PasswordAuthenticator}. +# +# - AllowAllAuthenticator performs no checks - set it to disable authentication. +# - PasswordAuthenticator relies on username/password pairs to authenticate +# users. It keeps usernames and hashed passwords in system_auth.roles table. +# Please increase system_auth keyspace replication factor if you use this authenticator. +# If using PasswordAuthenticator, CassandraRoleManager must also be used (see below) +authenticator: AllowAllAuthenticator + +# Authorization backend, implementing IAuthorizer; used to limit access/provide permissions +# Out of the box, Cassandra provides org.apache.cassandra.auth.{AllowAllAuthorizer, +# CassandraAuthorizer}. +# +# - AllowAllAuthorizer allows any action to any user - set it to disable authorization. +# - CassandraAuthorizer stores permissions in system_auth.role_permissions table. Please +# increase system_auth keyspace replication factor if you use this authorizer. +authorizer: AllowAllAuthorizer + +# Part of the Authentication & Authorization backend, implementing IRoleManager; used +# to maintain grants and memberships between roles. +# Out of the box, Cassandra provides org.apache.cassandra.auth.CassandraRoleManager, +# which stores role information in the system_auth keyspace. Most functions of the +# IRoleManager require an authenticated login, so unless the configured IAuthenticator +# actually implements authentication, most of this functionality will be unavailable. +# +# - CassandraRoleManager stores role data in the system_auth keyspace. Please +# increase system_auth keyspace replication factor if you use this role manager. 
+role_manager: CassandraRoleManager + +# Validity period for roles cache (fetching granted roles can be an expensive +# operation depending on the role manager, CassandraRoleManager is one example) +# Granted roles are cached for authenticated sessions in AuthenticatedUser and +# after the period specified here, become eligible for (async) reload. +# Defaults to 2000, set to 0 to disable caching entirely. +# Will be disabled automatically for AllowAllAuthenticator. +roles_validity_in_ms: 2000 + +# Refresh interval for roles cache (if enabled). +# After this interval, cache entries become eligible for refresh. Upon next +# access, an async reload is scheduled and the old value returned until it +# completes. If roles_validity_in_ms is non-zero, then this must be +# also. +# Defaults to the same value as roles_validity_in_ms. +# roles_update_interval_in_ms: 2000 + +# Validity period for permissions cache (fetching permissions can be an +# expensive operation depending on the authorizer, CassandraAuthorizer is +# one example). Defaults to 2000, set to 0 to disable. +# Will be disabled automatically for AllowAllAuthorizer. +permissions_validity_in_ms: 2000 + +# Refresh interval for permissions cache (if enabled). +# After this interval, cache entries become eligible for refresh. Upon next +# access, an async reload is scheduled and the old value returned until it +# completes. If permissions_validity_in_ms is non-zero, then this must be +# also. +# Defaults to the same value as permissions_validity_in_ms. +# permissions_update_interval_in_ms: 2000 + +# Validity period for credentials cache. This cache is tightly coupled to +# the provided PasswordAuthenticator implementation of IAuthenticator. If +# another IAuthenticator implementation is configured, this cache will not +# be automatically used and so the following settings will have no effect. 
+# Please note, credentials are cached in their encrypted form, so while +# activating this cache may reduce the number of queries made to the +# underlying table, it may not bring a significant reduction in the +# latency of individual authentication attempts. +# Defaults to 2000, set to 0 to disable credentials caching. +credentials_validity_in_ms: 2000 + +# Refresh interval for credentials cache (if enabled). +# After this interval, cache entries become eligible for refresh. Upon next +# access, an async reload is scheduled and the old value returned until it +# completes. If credentials_validity_in_ms is non-zero, then this must be +# also. +# Defaults to the same value as credentials_validity_in_ms. +# credentials_update_interval_in_ms: 2000 + +# The partitioner is responsible for distributing groups of rows (by +# partition key) across nodes in the cluster. You should leave this +# alone for new clusters. The partitioner can NOT be changed without +# reloading all data, so when upgrading you should set this to the +# same partitioner you were already using. +# +# Besides Murmur3Partitioner, partitioners included for backwards +# compatibility include RandomPartitioner, ByteOrderedPartitioner, and +# OrderPreservingPartitioner. +# +partitioner: org.apache.cassandra.dht.Murmur3Partitioner + +# Directories where Cassandra should store data on disk. Cassandra +# will spread data evenly across them, subject to the granularity of +# the configured compaction strategy. +# If not set, the default directory is $CASSANDRA_HOME/data/data. +data_file_directories: + - /var/lib/cassandra/data + +# commit log. when running on magnetic HDD, this should be a +# separate spindle than the data directories. +# If not set, the default directory is $CASSANDRA_HOME/data/commitlog. +commitlog_directory: /var/lib/cassandra/commitlog + +# Enable / disable CDC functionality on a per-node basis. This modifies the logic used +# for write path allocation rejection (standard: never reject. 
cdc: reject Mutation +# containing a CDC-enabled table if at space limit in cdc_raw_directory). +cdc_enabled: false + +# CommitLogSegments are moved to this directory on flush if cdc_enabled: true and the +# segment contains mutations for a CDC-enabled table. This should be placed on a +# separate spindle than the data directories. If not set, the default directory is +# $CASSANDRA_HOME/data/cdc_raw. +# cdc_raw_directory: /var/lib/cassandra/cdc_raw + +# Policy for data disk failures: +# +# die +# shut down gossip and client transports and kill the JVM for any fs errors or +# single-sstable errors, so the node can be replaced. +# +# stop_paranoid +# shut down gossip and client transports even for single-sstable errors, +# kill the JVM for errors during startup. +# +# stop +# shut down gossip and client transports, leaving the node effectively dead, but +# can still be inspected via JMX, kill the JVM for errors during startup. +# +# best_effort +# stop using the failed disk and respond to requests based on +# remaining available sstables. This means you WILL see obsolete +# data at CL.ONE! +# +# ignore +# ignore fatal errors and let requests fail, as in pre-1.2 Cassandra +disk_failure_policy: stop + +# Policy for commit disk failures: +# +# die +# shut down gossip and Thrift and kill the JVM, so the node can be replaced. +# +# stop +# shut down gossip and Thrift, leaving the node effectively dead, but +# can still be inspected via JMX. +# +# stop_commit +# shutdown the commit log, letting writes collect but +# continuing to service reads, as in pre-2.0.5 Cassandra +# +# ignore +# ignore fatal errors and let the batches fail +commit_failure_policy: stop + +# Maximum size of the native protocol prepared statement cache +# +# Valid values are either "auto" (omitting the value) or a value greater than 0. +# +# Note that specifying a too large value will result in long running GCs and possibly +# out-of-memory errors. Keep the value at a small fraction of the heap.
+# +# If you constantly see "prepared statements discarded in the last minute because +# cache limit reached" messages, the first step is to investigate the root cause +# of these messages and check whether prepared statements are used correctly - +# i.e. use bind markers for variable parts. +# +# Do only change the default value, if you really have more prepared statements than +# fit in the cache. In most cases it is not necessary to change this value. +# Constantly re-preparing statements is a performance penalty. +# +# Default value ("auto") is 1/256th of the heap or 10MB, whichever is greater +prepared_statements_cache_size_mb: + +# Maximum size of the Thrift prepared statement cache +# +# If you do not use Thrift at all, it is safe to leave this value at "auto". +# +# See description of 'prepared_statements_cache_size_mb' above for more information. +# +# Default value ("auto") is 1/256th of the heap or 10MB, whichever is greater +thrift_prepared_statements_cache_size_mb: + +# Maximum size of the key cache in memory. +# +# Each key cache hit saves 1 seek and each row cache hit saves 2 seeks at the +# minimum, sometimes more. The key cache is fairly tiny for the amount of +# time it saves, so it's worthwhile to use it at large numbers. +# The row cache saves even more time, but must contain the entire row, +# so it is extremely space-intensive. It's best to only use the +# row cache if you have hot rows or static rows. +# +# NOTE: if you reduce the size, you may not get your hottest keys loaded on startup. +# +# Default value is empty to make it "auto" (min(5% of Heap (in MB), 100MB)). Set to 0 to disable key cache. +key_cache_size_in_mb: + +# Duration in seconds after which Cassandra should +# save the key cache. Caches are saved to saved_caches_directory as +# specified in this configuration file. +# +# Saved caches greatly improve cold-start speeds, and is relatively cheap in +# terms of I/O for the key cache.
Row cache saving is much more expensive and +# has limited use. +# +# Default is 14400 or 4 hours. +key_cache_save_period: 14400 + +# Number of keys from the key cache to save +# Disabled by default, meaning all keys are going to be saved +# key_cache_keys_to_save: 100 + +# Row cache implementation class name. Available implementations: +# +# org.apache.cassandra.cache.OHCProvider +# Fully off-heap row cache implementation (default). +# +# org.apache.cassandra.cache.SerializingCacheProvider +# This is the row cache implementation available +# in previous releases of Cassandra. +# row_cache_class_name: org.apache.cassandra.cache.OHCProvider + +# Maximum size of the row cache in memory. +# Please note that OHC cache implementation requires some additional off-heap memory to manage +# the map structures and some in-flight memory during operations before/after cache entries can be +# accounted against the cache capacity. This overhead is usually small compared to the whole capacity. +# Do not specify more memory than the system can afford in the worst usual situation and leave some +# headroom for OS block level cache. Never allow your system to swap. +# +# Default value is 0, to disable row caching. +row_cache_size_in_mb: 0 + +# Duration in seconds after which Cassandra should save the row cache. +# Caches are saved to saved_caches_directory as specified in this configuration file. +# +# Saved caches greatly improve cold-start speeds, and is relatively cheap in +# terms of I/O for the key cache. Row cache saving is much more expensive and +# has limited use. +# +# Default is 0 to disable saving the row cache. +row_cache_save_period: 0 + +# Number of keys from the row cache to save. +# Specify 0 (which is the default), meaning all keys are going to be saved +# row_cache_keys_to_save: 100 + +# Maximum size of the counter cache in memory. +# +# Counter cache helps to reduce counter locks' contention for hot counter cells.
+# In case of RF = 1 a counter cache hit will cause Cassandra to skip the read before +# write entirely. With RF > 1 a counter cache hit will still help to reduce the duration +# of the lock hold, helping with hot counter cell updates, but will not allow skipping +# the read entirely. Only the local (clock, count) tuple of a counter cell is kept +# in memory, not the whole counter, so it's relatively cheap. +# +# NOTE: if you reduce the size, you may not get your hottest keys loaded on startup. +# +# Default value is empty to make it "auto" (min(2.5% of Heap (in MB), 50MB)). Set to 0 to disable counter cache. +# NOTE: if you perform counter deletes and rely on low gcgs, you should disable the counter cache. +counter_cache_size_in_mb: + +# Duration in seconds after which Cassandra should +# save the counter cache (keys only). Caches are saved to saved_caches_directory as +# specified in this configuration file. +# +# Default is 7200 or 2 hours. +counter_cache_save_period: 7200 + +# Number of keys from the counter cache to save +# Disabled by default, meaning all keys are going to be saved +# counter_cache_keys_to_save: 100 + +# saved caches +# If not set, the default directory is $CASSANDRA_HOME/data/saved_caches. +saved_caches_directory: /var/lib/cassandra/saved_caches + +# commitlog_sync may be either "periodic" or "batch." +# +# When in batch mode, Cassandra won't ack writes until the commit log +# has been fsynced to disk. It will wait +# commitlog_sync_batch_window_in_ms milliseconds between fsyncs. +# This window should be kept short because the writer threads will +# be unable to do extra work while waiting. (You may need to increase +# concurrent_writes for the same reason.) +# +# commitlog_sync: batch +# commitlog_sync_batch_window_in_ms: 2 +# +# the other option is "periodic" where writes may be acked immediately +# and the CommitLog is simply synced every commitlog_sync_period_in_ms +# milliseconds.
+commitlog_sync: periodic +commitlog_sync_period_in_ms: 10000 + +# The size of the individual commitlog file segments. A commitlog +# segment may be archived, deleted, or recycled once all the data +# in it (potentially from each columnfamily in the system) has been +# flushed to sstables. +# +# The default size is 32, which is almost always fine, but if you are +# archiving commitlog segments (see commitlog_archiving.properties), +# then you probably want a finer granularity of archiving; 8 or 16 MB +# is reasonable. +# Max mutation size is also configurable via max_mutation_size_in_kb setting in +# cassandra.yaml. The default is half the size commitlog_segment_size_in_mb * 1024. +# This should be positive and less than 2048. +# +# NOTE: If max_mutation_size_in_kb is set explicitly then commitlog_segment_size_in_mb must +# be set to at least twice the size of max_mutation_size_in_kb / 1024 +# +commitlog_segment_size_in_mb: 32 + +# Compression to apply to the commit log. If omitted, the commit log +# will be written uncompressed. LZ4, Snappy, and Deflate compressors +# are supported. +# commitlog_compression: +# - class_name: LZ4Compressor +# parameters: +# - + +# any class that implements the SeedProvider interface and has a +# constructor that takes a Map of parameters will do. +seed_provider: + # Addresses of hosts that are deemed contact points. + # Cassandra nodes use this list of hosts to find each other and learn + # the topology of the ring. You must change this if you are running + # multiple nodes! + - class_name: org.apache.cassandra.locator.SimpleSeedProvider + parameters: + # seeds is actually a comma-delimited list of addresses. + # Ex: "<ip1>,<ip2>,<ip3>" + - seeds: "127.0.0.1" + +# For workloads with more data than can fit in memory, Cassandra's +# bottleneck will be reads that need to fetch data from +# disk.
"concurrent_reads" should be set to (16 * number_of_drives) in +# order to allow the operations to enqueue low enough in the stack +# that the OS and drives can reorder them. Same applies to +# "concurrent_counter_writes", since counter writes read the current +# values before incrementing and writing them back. +# +# On the other hand, since writes are almost never IO bound, the ideal +# number of "concurrent_writes" is dependent on the number of cores in +# your system; (8 * number_of_cores) is a good rule of thumb. +concurrent_reads: 32 +concurrent_writes: 32 +concurrent_counter_writes: 32 + +# For materialized view writes, as there is a read involved, so this should +# be limited by the less of concurrent reads or concurrent writes. +concurrent_materialized_view_writes: 32 + +# Maximum memory to use for sstable chunk cache and buffer pooling. +# 32MB of this are reserved for pooling buffers, the rest is used as an +# cache that holds uncompressed sstable chunks. +# Defaults to the smaller of 1/4 of heap or 512MB. This pool is allocated off-heap, +# so is in addition to the memory allocated for heap. The cache also has on-heap +# overhead which is roughly 128 bytes per chunk (i.e. 0.2% of the reserved size +# if the default 64k chunk size is used). +# Memory is only allocated when needed. +# file_cache_size_in_mb: 512 + +# Flag indicating whether to allocate on or off heap when the sstable buffer +# pool is exhausted, that is when it has exceeded the maximum memory +# file_cache_size_in_mb, beyond which it will not cache buffers but allocate on request. + +# buffer_pool_use_heap_if_exhausted: true + +# The strategy for optimizing disk read +# Possible values are: +# ssd (for solid state disks, the default) +# spinning (for spinning disks) +# disk_optimization_strategy: ssd + +# Total permitted memory to use for memtables. 
Cassandra will stop +# accepting writes when the limit is exceeded until a flush completes, +# and will trigger a flush based on memtable_cleanup_threshold +# If omitted, Cassandra will set both to 1/4 the size of the heap. +memtable_heap_space_in_mb: 2048 +memtable_offheap_space_in_mb: 2048 + +# memtable_cleanup_threshold is deprecated. The default calculation +# is the only reasonable choice. See the comments on memtable_flush_writers +# for more information. +# +# Ratio of occupied non-flushing memtable size to total permitted size +# that will trigger a flush of the largest memtable. Larger mct will +# mean larger flushes and hence less compaction, but also less concurrent +# flush activity which can make it difficult to keep your disks fed +# under heavy write load. +# +# memtable_cleanup_threshold defaults to 1 / (memtable_flush_writers + 1) +# memtable_cleanup_threshold: 0.11 + +# Specify the way Cassandra allocates and manages memtable memory. +# Options are: +# +# heap_buffers +# on heap nio buffers +# +# offheap_buffers +# off heap (direct) nio buffers +# +# offheap_objects +# off heap objects +memtable_allocation_type: heap_buffers + +# Total space to use for commit logs on disk. +# +# If space gets above this value, Cassandra will flush every dirty CF +# in the oldest segment and remove it. So a small total commitlog space +# will tend to cause more flush activity on less-active columnfamilies. +# +# The default value is the smaller of 8192, and 1/4 of the total space +# of the commitlog volume. +# +# commitlog_total_space_in_mb: 8192 + +# This sets the number of memtable flush writer threads per disk +# as well as the total number of memtables that can be flushed concurrently. +# These are generally a combination of compute and IO bound. 
+# +# Memtable flushing is more CPU efficient than memtable ingest and a single thread +# can keep up with the ingest rate of a whole server on a single fast disk +# until it temporarily becomes IO bound under contention typically with compaction. +# At that point you need multiple flush threads. At some point in the future +# it may become CPU bound all the time. +# +# You can tell if flushing is falling behind using the MemtablePool.BlockedOnAllocation +# metric which should be 0, but will be non-zero if threads are blocked waiting on flushing +# to free memory. +# +# memtable_flush_writers defaults to two for a single data directory. +# This means that two memtables can be flushed concurrently to the single data directory. +# If you have multiple data directories the default is one memtable flushing at a time +# but the flush will use a thread per data directory so you will get two or more writers. +# +# Two is generally enough to flush on a fast disk [array] mounted as a single data directory. +# Adding more flush writers will result in smaller more frequent flushes that introduce more +# compaction overhead. +# +# There is a direct tradeoff between number of memtables that can be flushed concurrently +# and flush size and frequency. More is not better you just need enough flush writers +# to never stall waiting for flushing to free memory. +# +#memtable_flush_writers: 2 + +# Total space to use for change-data-capture logs on disk. +# +# If space gets above this value, Cassandra will throw WriteTimeoutException +# on Mutations including tables with CDC enabled. A CDCCompactor is responsible +# for parsing the raw CDC logs and deleting them when parsing is completed. +# +# The default value is the min of 4096 mb and 1/8th of the total space +# of the drive where cdc_raw_directory resides. 
+# cdc_total_space_in_mb: 4096 + +# When we hit our cdc_raw limit and the CDCCompactor is either running behind +# or experiencing backpressure, we check at the following interval to see if any +# new space for cdc-tracked tables has been made available. Defaults to 250ms +# cdc_free_space_check_interval_ms: 250 + +# A fixed memory pool size in MB for SSTable index summaries. If left +# empty, this will default to 5% of the heap size. If the memory usage of +# all index summaries exceeds this limit, SSTables with low read rates will +# shrink their index summaries in order to meet this limit. However, this +# is a best-effort process. In extreme conditions Cassandra may need to use +# more than this amount of memory. +index_summary_capacity_in_mb: + +# How frequently index summaries should be resampled. This is done +# periodically to redistribute memory from the fixed-size pool to sstables +# proportional to their recent read rates. Setting to -1 will disable this +# process, leaving existing index summaries at their current sampling level. +index_summary_resize_interval_in_minutes: 60 + +# Whether to, when doing sequential writing, fsync() at intervals in +# order to force the operating system to flush the dirty +# buffers. Enable this to avoid sudden dirty buffer flushing from +# impacting read latencies. Almost always a good idea on SSDs; not +# necessarily on platters. +trickle_fsync: false +trickle_fsync_interval_in_kb: 10240 + +# TCP port, for commands and data +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +storage_port: 7000 + +# SSL port, for encrypted communication. Unused unless enabled in +# encryption_options +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +ssl_storage_port: 7001 + +# Address or interface to bind to and tell other Cassandra nodes to connect to. +# You _must_ change this if you want multiple nodes to be able to communicate! 
+# +# Set listen_address OR listen_interface, not both. +# +# Leaving it blank leaves it up to InetAddress.getLocalHost(). This +# will always do the Right Thing _if_ the node is properly configured +# (hostname, name resolution, etc), and the Right Thing is to use the +# address associated with the hostname (it might not be). +# +# Setting listen_address to 0.0.0.0 is always wrong. +# +listen_address: localhost + +# Set listen_address OR listen_interface, not both. Interfaces must correspond +# to a single address, IP aliasing is not supported. +# listen_interface: eth0 + +# If you choose to specify the interface by name and the interface has an ipv4 and an ipv6 address +# you can specify which should be chosen using listen_interface_prefer_ipv6. If false the first ipv4 +# address will be used. If true the first ipv6 address will be used. Defaults to false preferring +# ipv4. If there is only one address it will be selected regardless of ipv4/ipv6. +# listen_interface_prefer_ipv6: false + +# Address to broadcast to other Cassandra nodes +# Leaving this blank will set it to the same value as listen_address +# broadcast_address: 1.2.3.4 + +# When using multiple physical network interfaces, set this +# to true to listen on broadcast_address in addition to +# the listen_address, allowing nodes to communicate in both +# interfaces. +# Ignore this property if the network configuration automatically +# routes between the public and private networks such as EC2. +# listen_on_broadcast_address: false + +# Internode authentication backend, implementing IInternodeAuthenticator; +# used to allow/disallow connections from peer nodes. +# internode_authenticator: org.apache.cassandra.auth.AllowAllInternodeAuthenticator + +# Whether to start the native transport server. +# Please note that the address on which the native transport is bound is the +# same as the rpc_address. The port however is different and specified below. 
+start_native_transport: true +# port for the CQL native transport to listen for clients on +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +native_transport_port: 9042 +# Enabling native transport encryption in client_encryption_options allows you to either use +# encryption for the standard port or to use a dedicated, additional port along with the unencrypted +# standard native_transport_port. +# Enabling client encryption and keeping native_transport_port_ssl disabled will use encryption +# for native_transport_port. Setting native_transport_port_ssl to a different value +# from native_transport_port will use encryption for native_transport_port_ssl while +# keeping native_transport_port unencrypted. +native_transport_port_ssl: 9142 +# The maximum threads for handling requests when the native transport is used. +# This is similar to rpc_max_threads though the default differs slightly (and +# there is no native_transport_min_threads, idle threads will always be stopped +# after 30 seconds). +# native_transport_max_threads: 128 +# +# The maximum size of allowed frame. Frame (requests) larger than this will +# be rejected as invalid. The default is 256MB. If you're changing this parameter, +# you may want to adjust max_value_size_in_mb accordingly. This should be positive and less than 2048. +# native_transport_max_frame_size_in_mb: 256 + +# The maximum number of concurrent client connections. +# The default is -1, which means unlimited. +# native_transport_max_concurrent_connections: -1 + +# The maximum number of concurrent client connections per source ip. +# The default is -1, which means unlimited. +# native_transport_max_concurrent_connections_per_ip: -1 + +# Whether to start the thrift rpc server. +start_rpc: true + +# The address or interface to bind the Thrift RPC service and native transport +# server to. +# +# Set rpc_address OR rpc_interface, not both. 
+# +# Leaving rpc_address blank has the same effect as on listen_address +# (i.e. it will be based on the configured hostname of the node). +# +# Note that unlike listen_address, you can specify 0.0.0.0, but you must also +# set broadcast_rpc_address to a value other than 0.0.0.0. +# +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +rpc_address: 0.0.0.0 + +# Set rpc_address OR rpc_interface, not both. Interfaces must correspond +# to a single address, IP aliasing is not supported. +# rpc_interface: eth1 + +# If you choose to specify the interface by name and the interface has an ipv4 and an ipv6 address +# you can specify which should be chosen using rpc_interface_prefer_ipv6. If false the first ipv4 +# address will be used. If true the first ipv6 address will be used. Defaults to false preferring +# ipv4. If there is only one address it will be selected regardless of ipv4/ipv6. +# rpc_interface_prefer_ipv6: false + +# port for Thrift to listen for clients on +rpc_port: 9160 + +# RPC address to broadcast to drivers and other Cassandra nodes. This cannot +# be set to 0.0.0.0. If left blank, this will be set to the value of +# rpc_address. If rpc_address is set to 0.0.0.0, broadcast_rpc_address must +# be set. +broadcast_rpc_address: 1.2.3.4 + +# enable or disable keepalive on rpc/native connections +rpc_keepalive: true + +# Cassandra provides two out-of-the-box options for the RPC Server: +# +# sync +# One thread per thrift connection. For a very large number of clients, memory +# will be your limiting factor. On a 64 bit JVM, 180KB is the minimum stack size +# per thread, and that will correspond to your use of virtual memory (but physical memory +# may be limited depending on use of stack space). +# +# hsha +# Stands for "half synchronous, half asynchronous." 
All thrift clients are handled +# asynchronously using a small number of threads that does not vary with the amount +# of thrift clients (and thus scales well to many clients). The rpc requests are still +# synchronous (one thread per active request). If hsha is selected then it is essential +# that rpc_max_threads is changed from the default value of unlimited. +# +# The default is sync because on Windows hsha is about 30% slower. On Linux, +# sync/hsha performance is about the same, with hsha of course using less memory. +# +# Alternatively, can provide your own RPC server by providing the fully-qualified class name +# of an o.a.c.t.TServerFactory that can create an instance of it. +rpc_server_type: sync + +# Uncomment rpc_min|max_thread to set request pool size limits. +# +# Regardless of your choice of RPC server (see above), the number of maximum requests in the +# RPC thread pool dictates how many concurrent requests are possible (but if you are using the sync +# RPC server, it also dictates the number of clients that can be connected at all). +# +# The default is unlimited and thus provides no protection against clients overwhelming the server. You are +# encouraged to set a maximum that makes sense for you in production, but do keep in mind that +# rpc_max_threads represents the maximum number of client requests this server may execute concurrently. 
+# +# rpc_min_threads: 16 +# rpc_max_threads: 2048 + +# uncomment to set socket buffer sizes on rpc connections +# rpc_send_buff_size_in_bytes: +# rpc_recv_buff_size_in_bytes: + +# Uncomment to set socket buffer size for internode communication +# Note that when setting this, the buffer size is limited by net.core.wmem_max +# and when not setting it it is defined by net.ipv4.tcp_wmem +# See also: +# /proc/sys/net/core/wmem_max +# /proc/sys/net/core/rmem_max +# /proc/sys/net/ipv4/tcp_wmem +# /proc/sys/net/ipv4/tcp_rmem +# and 'man tcp' +# internode_send_buff_size_in_bytes: + +# Uncomment to set socket buffer size for internode communication +# Note that when setting this, the buffer size is limited by net.core.rmem_max +# and when not setting it it is defined by net.ipv4.tcp_rmem +# internode_recv_buff_size_in_bytes: + +# Frame size for thrift (maximum message length). +thrift_framed_transport_size_in_mb: 15 + +# Set to true to have Cassandra create a hard link to each sstable +# flushed or streamed locally in a backups/ subdirectory of the +# keyspace data. Removing these links is the operator's +# responsibility. +incremental_backups: false + +# Whether or not to take a snapshot before each compaction. Be +# careful using this option, since Cassandra won't clean up the +# snapshots for you. Mostly useful if you're paranoid when there +# is a data format change. +snapshot_before_compaction: false + +# Whether or not a snapshot is taken of the data before keyspace truncation +# or dropping of column families. The STRONGLY advised default of true +# should be used to provide data safety. If you set this flag to false, you will +# lose data on truncation or drop. +auto_snapshot: true + +# Granularity of the collation index of rows within a partition. +# Increase if your rows are large, or if you have a very large +# number of rows per partition. 
The competing goals are these: +# +# - a smaller granularity means more index entries are generated +# and looking up rows within the partition by collation column +# is faster +# - but, Cassandra will keep the collation index in memory for hot +# rows (as part of the key cache), so a larger granularity means +# you can cache more hot rows +column_index_size_in_kb: 64 + +# Per sstable indexed key cache entries (the collation index in memory +# mentioned above) exceeding this size will not be held on heap. +# This means that only partition information is held on heap and the +# index entries are read from disk. +# +# Note that this size refers to the size of the +# serialized index information and not the size of the partition. +column_index_cache_size_in_kb: 2 + +# Number of simultaneous compactions to allow, NOT including +# validation "compactions" for anti-entropy repair. Simultaneous +# compactions can help preserve read performance in a mixed read/write +# workload, by mitigating the tendency of small sstables to accumulate +# during a single long-running compaction. The default is usually +# fine and if you experience problems with compaction running too +# slowly or too fast, you should look at +# compaction_throughput_mb_per_sec first. +# +# concurrent_compactors defaults to the smaller of (number of disks, +# number of cores), with a minimum of 2 and a maximum of 8. +# +# If your data directories are backed by SSD, you should increase this +# to the number of cores. +#concurrent_compactors: 1 + +# Throttles compaction to the given total throughput across the entire +# system. The faster you insert data, the faster you need to compact in +# order to keep the sstable count down, but in general, setting this to +# 16 to 32 times the rate you are inserting data is more than sufficient. +# Setting this to 0 disables throttling. Note that this accounts for all types +# of compaction, including validation compaction. 
+compaction_throughput_mb_per_sec: 16 + +# When compacting, the replacement sstable(s) can be opened before they +# are completely written, and used in place of the prior sstables for +# any range that has been written. This helps to smoothly transfer reads +# between the sstables, reducing page cache churn and keeping hot rows hot +sstable_preemptive_open_interval_in_mb: 50 + +# Throttles all outbound streaming file transfers on this node to the +# given total throughput in Mbps. This is necessary because Cassandra does +# mostly sequential IO when streaming data during bootstrap or repair, which +# can lead to saturating the network connection and degrading rpc performance. +# When unset, the default is 200 Mbps or 25 MB/s. +# stream_throughput_outbound_megabits_per_sec: 200 + +# Throttles all streaming file transfer between the datacenters, +# this setting allows users to throttle inter dc stream throughput in addition +# to throttling all network stream traffic as configured with +# stream_throughput_outbound_megabits_per_sec +# When unset, the default is 200 Mbps or 25 MB/s +# inter_dc_stream_throughput_outbound_megabits_per_sec: 200 + +# How long the coordinator should wait for read operations to complete +read_request_timeout_in_ms: 5000 +# How long the coordinator should wait for seq or index scans to complete +range_request_timeout_in_ms: 10000 +# How long the coordinator should wait for writes to complete +write_request_timeout_in_ms: 2000 +# How long the coordinator should wait for counter writes to complete +counter_write_request_timeout_in_ms: 5000 +# How long a coordinator should continue to retry a CAS operation +# that contends with other proposals for the same row +cas_contention_timeout_in_ms: 1000 +# How long the coordinator should wait for truncates to complete +# (This can be much longer, because unless auto_snapshot is disabled +# we need to flush first so we can snapshot before removing the data.) 
+truncate_request_timeout_in_ms: 60000 +# The default timeout for other, miscellaneous operations +request_timeout_in_ms: 10000 + +# How long before a node logs slow queries. Select queries that take longer than +# this timeout to execute will generate an aggregated log message, so that slow queries +# can be identified. Set this value to zero to disable slow query logging. +slow_query_log_timeout_in_ms: 500 + +# Enable operation timeout information exchange between nodes to accurately +# measure request timeouts. If disabled, replicas will assume that requests +# were forwarded to them instantly by the coordinator, which means that +# under overload conditions we will waste that much extra time processing +# already-timed-out requests. +# +# Warning: before enabling this property make sure ntp is installed +# and the times are synchronized between the nodes. +cross_node_timeout: false + +# Set keep-alive period for streaming +# This node will send a keep-alive message periodically with this period. +# If the node does not receive a keep-alive message from the peer for +# 2 keep-alive cycles the stream session times out and fails +# Default value is 300s (5 minutes), which means stalled stream +# times out in 10 minutes by default +# streaming_keep_alive_period_in_secs: 300 + +# phi value that must be reached for a host to be marked down. +# most users should never need to adjust this. +# phi_convict_threshold: 8 + +# endpoint_snitch -- Set this to a class that implements +# IEndpointSnitch. The snitch has two functions: +# +# - it teaches Cassandra enough about your network topology to route +# requests efficiently +# - it allows Cassandra to spread replicas around your cluster to avoid +# correlated failures. It does this by grouping machines into +# "datacenters" and "racks." 
Cassandra will do its best not to have +# more than one replica on the same "rack" (which may not actually +# be a physical location) +# +# CASSANDRA WILL NOT ALLOW YOU TO SWITCH TO AN INCOMPATIBLE SNITCH +# ONCE DATA IS INSERTED INTO THE CLUSTER. This would cause data loss. +# This means that if you start with the default SimpleSnitch, which +# locates every node on "rack1" in "datacenter1", your only options +# if you need to add another datacenter are GossipingPropertyFileSnitch +# (and the older PFS). From there, if you want to migrate to an +# incompatible snitch like Ec2Snitch you can do it by adding new nodes +# under Ec2Snitch (which will locate them in a new "datacenter") and +# decommissioning the old ones. +# +# Out of the box, Cassandra provides: +# +# SimpleSnitch: +# Treats Strategy order as proximity. This can improve cache +# locality when disabling read repair. Only appropriate for +# single-datacenter deployments. +# +# GossipingPropertyFileSnitch +# This should be your go-to snitch for production use. The rack +# and datacenter for the local node are defined in +# cassandra-rackdc.properties and propagated to other nodes via +# gossip. If cassandra-topology.properties exists, it is used as a +# fallback, allowing migration from the PropertyFileSnitch. +# +# PropertyFileSnitch: +# Proximity is determined by rack and data center, which are +# explicitly configured in cassandra-topology.properties. +# +# Ec2Snitch: +# Appropriate for EC2 deployments in a single Region. Loads Region +# and Availability Zone information from the EC2 API. The Region is +# treated as the datacenter, and the Availability Zone as the rack. +# Only private IPs are used, so this will not work across multiple +# Regions. +# +# Ec2MultiRegionSnitch: +# Uses public IPs as broadcast_address to allow cross-region +# connectivity. (Thus, you should set seed addresses to the public +# IP as well.) 
You will need to open the storage_port or +# ssl_storage_port on the public IP firewall. (For intra-Region +# traffic, Cassandra will switch to the private IP after +# establishing a connection.) +# +# RackInferringSnitch: +# Proximity is determined by rack and data center, which are +# assumed to correspond to the 3rd and 2nd octet of each node's IP +# address, respectively. Unless this happens to match your +# deployment conventions, this is best used as an example of +# writing a custom Snitch class and is provided in that spirit. +# +# You can use a custom Snitch by setting this to the full class name +# of the snitch, which will be assumed to be on your classpath. +endpoint_snitch: SimpleSnitch + +# controls how often to perform the more expensive part of host score +# calculation +dynamic_snitch_update_interval_in_ms: 100 +# controls how often to reset all host scores, allowing a bad host to +# possibly recover +dynamic_snitch_reset_interval_in_ms: 600000 +# if set greater than zero and read_repair_chance is < 1.0, this will allow +# 'pinning' of replicas to hosts in order to increase cache capacity. +# The badness threshold will control how much worse the pinned host has to be +# before the dynamic snitch will prefer other replicas over it. This is +# expressed as a double which represents a percentage. Thus, a value of +# 0.2 means Cassandra would continue to prefer the static snitch values +# until the pinned host was 20% worse than the fastest. +dynamic_snitch_badness_threshold: 0.1 + +# request_scheduler -- Set this to a class that implements +# RequestScheduler, which will schedule incoming client requests +# according to the specific policy. This is useful for multi-tenancy +# with a single Cassandra cluster. +# NOTE: This is specifically for requests from the client and does +# not affect inter node communication. 
+# org.apache.cassandra.scheduler.NoScheduler - No scheduling takes place +# org.apache.cassandra.scheduler.RoundRobinScheduler - Round robin of +# client requests to a node with a separate queue for each +# request_scheduler_id. The scheduler is further customized by +# request_scheduler_options as described below. +request_scheduler: org.apache.cassandra.scheduler.NoScheduler + +# Scheduler Options vary based on the type of scheduler +# +# NoScheduler +# Has no options +# +# RoundRobin +# throttle_limit +# The throttle_limit is the number of in-flight +# requests per client. Requests beyond +# that limit are queued up until +# running requests can complete. +# The value of 80 here is twice the number of +# concurrent_reads + concurrent_writes. +# default_weight +# default_weight is optional and allows for +# overriding the default which is 1. +# weights +# Weights are optional and will default to 1 or the +# overridden default_weight. The weight translates into how +# many requests are handled during each turn of the +# RoundRobin, based on the scheduler id. +# +# request_scheduler_options: +# throttle_limit: 80 +# default_weight: 5 +# weights: +# Keyspace1: 1 +# Keyspace2: 5 + +# request_scheduler_id -- An identifier based on which to perform +# the request scheduling. Currently the only valid option is keyspace. +# request_scheduler_id: keyspace + +# Enable or disable inter-node encryption +# JVM defaults for supported SSL socket protocols and cipher suites can +# be replaced using custom encryption options. This is not recommended +# unless you have policies in place that dictate certain settings, or +# need to disable vulnerable ciphers or protocols in case the JVM cannot +# be updated. 
+# FIPS compliant settings can be configured at JVM level and should not +# involve changing encryption settings here: +# https://docs.oracle.com/javase/8/docs/technotes/guides/security/jsse/FIPS.html +# *NOTE* No custom encryption options are enabled at the moment +# The available internode options are : all, none, dc, rack +# +# If set to dc cassandra will encrypt the traffic between the DCs +# If set to rack cassandra will encrypt the traffic between the racks +# +# The passwords used in these options must match the passwords used when generating +# the keystore and truststore. For instructions on generating these files, see: +# http://download.oracle.com/javase/6/docs/technotes/guides/security/jsse/JSSERefGuide.html#CreateKeystore +# +server_encryption_options: + internode_encryption: none + keystore: conf/.keystore + keystore_password: cassandra + truststore: conf/.truststore + truststore_password: cassandra + # More advanced defaults below: + # protocol: TLS + # algorithm: SunX509 + # store_type: JKS + # cipher_suites: [TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA,TLS_DHE_RSA_WITH_AES_128_CBC_SHA,TLS_DHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA] + # require_client_auth: false + # require_endpoint_verification: false + +# enable or disable client/server encryption. +client_encryption_options: + enabled: true + # If enabled and optional is set to true encrypted and unencrypted connections are handled. 
+ optional: false + keystore: /certs/server.jks + keystore_password: my_password + require_client_auth: true + # Set truststore and truststore_password if require_client_auth is true + truststore: /certs/truststore.jks + truststore_password: my_password + # More advanced defaults below: + protocol: TLS + store_type: JKS + cipher_suites: [TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA,TLS_DHE_RSA_WITH_AES_128_CBC_SHA,TLS_DHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA] + +# internode_compression controls whether traffic between nodes is +# compressed. +# Can be: +# +# all +# all traffic is compressed +# +# dc +# traffic between different datacenters is compressed +# +# none +# nothing is compressed. +internode_compression: dc + +# Enable or disable tcp_nodelay for inter-dc communication. +# Disabling it will result in larger (but fewer) network packets being sent, +# reducing overhead from the TCP protocol itself, at the cost of increasing +# latency if you block for cross-datacenter responses. +inter_dc_tcp_nodelay: false + +# TTL for different trace types used during logging of the repair process. +tracetype_query_ttl: 86400 +tracetype_repair_ttl: 604800 + +# By default, Cassandra logs GC Pauses greater than 200 ms at INFO level +# This threshold can be adjusted to minimize logging if necessary +# gc_log_threshold_in_ms: 200 + +# If unset, all GC Pauses greater than gc_log_threshold_in_ms will log at +# INFO level +# UDFs (user defined functions) are disabled by default. +# As of Cassandra 3.0 there is a sandbox in place that should prevent execution of evil code. +enable_user_defined_functions: false + +# Enables scripted UDFs (JavaScript UDFs). +# Java UDFs are always enabled, if enable_user_defined_functions is true. +# Enable this option to be able to use UDFs with "language javascript" or any custom JSR-223 provider. +# This option has no effect, if enable_user_defined_functions is false. 
+enable_scripted_user_defined_functions: false + +# Enables materialized view creation on this node. +# Materialized views are considered experimental and are not recommended for production use. +enable_materialized_views: true + +# The default Windows kernel timer and scheduling resolution is 15.6ms for power conservation. +# Lowering this value on Windows can provide much tighter latency and better throughput, however +# some virtualized environments may see a negative performance impact from changing this setting +# below their system default. The sysinternals 'clockres' tool can confirm your system's default +# setting. +windows_timer_interval: 1 + + +# Enables encrypting data at-rest (on disk). Different key providers can be plugged in, but the default reads from +# a JCE-style keystore. A single keystore can hold multiple keys, but the one referenced by +# the "key_alias" is the only key that will be used for encrypt operations; previously used keys +# can still (and should!) be in the keystore and will be used on decrypt operations +# (to handle the case of key rotation). +# +# It is strongly recommended to download and install Java Cryptography Extension (JCE) +# Unlimited Strength Jurisdiction Policy Files for your version of the JDK. 
+# (current link: http://www.oracle.com/technetwork/java/javase/downloads/jce8-download-2133166.html) +# +# Currently, only the following file types are supported for transparent data encryption, although +# more are coming in future cassandra releases: commitlog, hints +transparent_data_encryption_options: + enabled: false + chunk_length_kb: 64 + cipher: AES/CBC/PKCS5Padding + key_alias: testing:1 + # CBC IV length for AES needs to be 16 bytes (which is also the default size) + # iv_length: 16 + key_provider: + - class_name: org.apache.cassandra.security.JKSKeyProvider + parameters: + - keystore: conf/.keystore + keystore_password: cassandra + store_type: JCEKS + key_password: cassandra + + +##################### +# SAFETY THRESHOLDS # +##################### + +# When executing a scan, within or across a partition, we need to keep the +# tombstones seen in memory so we can return them to the coordinator, which +# will use them to make sure other replicas also know about the deleted rows. +# With workloads that generate a lot of tombstones, this can cause performance +# problems and even exhaust the server heap. +# (http://www.datastax.com/dev/blog/cassandra-anti-patterns-queues-and-queue-like-datasets) +# Adjust the thresholds here if you understand the dangers and want to +# scan more tombstones anyway. These thresholds may also be adjusted at runtime +# using the StorageService mbean. +tombstone_warn_threshold: 1000 +tombstone_failure_threshold: 100000 + +# Log WARN on any multiple-partition batch size exceeding this value. 5kb per batch by default. +# Caution should be taken on increasing the size of this threshold as it can lead to node instability. +batch_size_warn_threshold_in_kb: 5 + +# Fail any multiple-partition batch exceeding this value. 50kb (10x warn threshold) by default. 
+batch_size_fail_threshold_in_kb: 50 + +# Log WARN on any batches not of type LOGGED that span across more partitions than this limit +unlogged_batch_across_partitions_warn_threshold: 10 + +# Log a warning when compacting partitions larger than this value +compaction_large_partition_warning_threshold_mb: 100 + +# GC Pauses greater than gc_warn_threshold_in_ms will be logged at WARN level +# Adjust the threshold based on your application throughput requirement +# By default, Cassandra logs GC Pauses greater than 200 ms at INFO level +gc_warn_threshold_in_ms: 1000 + +# Maximum size of any value in SSTables. Safety measure to detect SSTable corruption +# early. Any value size larger than this threshold will result in marking an SSTable +# as corrupted. This should be positive and less than 2048. +# max_value_size_in_mb: 256 + +# Back-pressure settings # +# If enabled, the coordinator will apply the back-pressure strategy specified below to each mutation +# sent to replicas, with the aim of reducing pressure on overloaded replicas. +back_pressure_enabled: false +# The back-pressure strategy applied. +# The default implementation, RateBasedBackPressure, takes three arguments: +# high ratio, factor, and flow type, and uses the ratio between incoming mutation responses and outgoing mutation requests. +# If below high ratio, outgoing mutations are rate limited according to the incoming rate decreased by the given factor; +# if above high ratio, the rate limiting is increased by the given factor; +# such factor is usually best configured between 1 and 10, use larger values for a faster recovery +# at the expense of potentially more dropped mutations; +# the rate limiting is applied according to the flow type: if FAST, it's rate limited at the speed of the fastest replica, +# if SLOW at the speed of the slowest one. +# New strategies can be added. 
Implementors need to implement org.apache.cassandra.net.BackpressureStrategy and +# provide a public constructor accepting a Map<String, Object>. +back_pressure_strategy: + - class_name: org.apache.cassandra.net.RateBasedBackPressure + parameters: + - high_ratio: 0.90 + factor: 5 + flow: FAST + +# Coalescing Strategies # +# Coalescing multiple messages turns out to significantly boost message processing throughput (think doubling or more). +# On bare metal, the floor for packet processing throughput is high enough that many applications won't notice, but in +# virtualized environments, the point at which an application can be bound by network packet processing can be +# surprisingly low compared to the throughput of task processing that is possible inside a VM. It's not that bare metal +# doesn't benefit from coalescing messages, it's that the number of packets a bare metal network interface can process +# is sufficient for many applications such that no load starvation is experienced even without coalescing. +# There are other benefits to coalescing network messages that are harder to isolate with a simple metric like messages +# per second. By coalescing multiple tasks together, a network thread can process multiple messages for the cost of one +# trip to read from a socket, and all the task submission work can be done at the same time reducing context switching +# and increasing cache friendliness of network message processing. +# See CASSANDRA-8692 for details. + +# Strategy to use for coalescing messages in OutboundTcpConnection. +# Can be fixed, movingaverage, timehorizon, disabled (default). +# You can also specify a subclass of CoalescingStrategies.CoalescingStrategy by name. +# otc_coalescing_strategy: DISABLED + +# How many microseconds to wait for coalescing. For fixed strategy this is the amount of time after the first +# message is received before it will be sent with any accompanying messages. 
For moving average this is the +# maximum amount of time that will be waited as well as the interval at which messages must arrive on average +# for coalescing to be enabled. +# otc_coalescing_window_us: 200 + +# Do not try to coalesce messages if we already got that many messages. This should be more than 2 and less than 128. +# otc_coalescing_enough_coalesced_messages: 8 + +# How many milliseconds to wait between two expiration runs on the backlog (queue) of the OutboundTcpConnection. +# Expiration is done if messages are piling up in the backlog. Droppable messages are expired to free the memory +# taken by expired messages. The interval should be between 0 and 1000, and in most installations the default value +# will be appropriate. A smaller value could potentially expire messages slightly sooner at the expense of more CPU +# time and queue contention while iterating the backlog of messages. +# An interval of 0 disables any wait time, which is the behavior of former Cassandra versions. +# +# otc_backlog_expiration_interval_ms: 200 diff --git a/.ci/docker-compose-file/docker-compose-cassandra.yaml b/.ci/docker-compose-file/docker-compose-cassandra.yaml index f7143f471..918a61037 100644 --- a/.ci/docker-compose-file/docker-compose-cassandra.yaml +++ b/.ci/docker-compose-file/docker-compose-cassandra.yaml @@ -1,32 +1,38 @@ version: '3.9' +x-cassandra: &cassandra + restart: always + image: cassandra:${CASSANDRA_TAG:-3.11.6} + environment: + CASSANDRA_BROADCAST_ADDRESS: "1.2.3.4" + CASSANDRA_RPC_ADDRESS: "0.0.0.0" + HEAP_NEWSIZE: "128M" + MAX_HEAP_SIZE: "2048M" + #ports: + # - "9042:9042" + # - "9142:9142" + command: + - /bin/bash + - -c + - | + /opt/cassandra/bin/cassandra -f -R > /cassandra.log & + /opt/cassandra/bin/cqlsh -u cassandra -p cassandra -e "CREATE KEYSPACE mqtt WITH REPLICATION = { 'class':'SimpleStrategy','replication_factor':1};" + while [[ $$? 
-ne 0 ]];do sleep 5; /opt/cassandra/bin/cqlsh -u cassandra -p cassandra -e "CREATE KEYSPACE mqtt WITH REPLICATION = { 'class':'SimpleStrategy','replication_factor':1};"; done + /opt/cassandra/bin/cqlsh -u cassandra -p cassandra -e "describe keyspaces;" + tail -f /cassandra.log + networks: + - emqx_bridge + services: cassandra_server: + <<: *cassandra container_name: cassandra - build: - context: ./cassandra - args: - CASSANDRA_TAG: ${CASSANDRA_TAG} - image: emqx-cassandra - restart: always - environment: - CASSANDRA_BROADCAST_ADDRESS: "1.2.3.4" - CASSANDRA_RPC_ADDRESS: "0.0.0.0" - HEAP_NEWSIZE: "128M" - MAX_HEAP_SIZE: "2048M" volumes: - ./certs:/certs - #ports: - # - "9042:9042" - # - "9142:9142" - command: - - /bin/bash - - -c - - | - /opt/cassandra/bin/cassandra -f -R > /cassandra.log & - /opt/cassandra/bin/cqlsh -u cassandra -p cassandra -e "CREATE KEYSPACE mqtt WITH REPLICATION = { 'class':'SimpleStrategy','replication_factor':1};" - while [[ $$? -ne 0 ]];do sleep 5; /opt/cassandra/bin/cqlsh -u cassandra -p cassandra -e "CREATE KEYSPACE mqtt WITH REPLICATION = { 'class':'SimpleStrategy','replication_factor':1};"; done - /opt/cassandra/bin/cqlsh -u cassandra -p cassandra -e "describe keyspaces;" - tail -f /cassandra.log - networks: - - emqx_bridge + - ./cassandra/cassandra.yaml:/etc/cassandra/cassandra.yaml + cassandra_noauth_server: + <<: *cassandra + container_name: cassandra_noauth + volumes: + - ./certs:/certs + - ./cassandra/cassandra_noauth.yaml:/etc/cassandra/cassandra.yaml diff --git a/apps/emqx_bridge_cassandra/test/emqx_bridge_cassandra_connector_SUITE.erl b/apps/emqx_bridge_cassandra/test/emqx_bridge_cassandra_connector_SUITE.erl index bceae1fd2..fcd482b47 100644 --- a/apps/emqx_bridge_cassandra/test/emqx_bridge_cassandra_connector_SUITE.erl +++ b/apps/emqx_bridge_cassandra/test/emqx_bridge_cassandra_connector_SUITE.erl @@ -7,15 +7,17 @@ -compile(nowarn_export_all). -compile(export_all). +-include_lib("common_test/include/ct.hrl"). 
-include("emqx_bridge_cassandra.hrl"). -include("emqx_connector/include/emqx_connector.hrl"). -include_lib("eunit/include/eunit.hrl"). -include_lib("emqx/include/emqx.hrl"). -include_lib("stdlib/include/assert.hrl"). -%% Cassandra server defined at `.ci/docker-compose-file/docker-compose-cassandra-tcp.yaml` +%% Cassandra servers are defined at `.ci/docker-compose-file/docker-compose-cassandra.yaml` %% You can change it to `127.0.0.1`, if you run this SUITE locally -define(CASSANDRA_HOST, "cassandra"). +-define(CASSANDRA_HOST_NOAUTH, "cassandra_noauth"). -define(CASSANDRA_RESOURCE_MOD, emqx_bridge_cassandra_connector). %% This test SUITE requires a running cassandra instance. If you don't want to @@ -32,40 +34,58 @@ -define(CASSA_PASSWORD, <<"cassandra">>). all() -> - emqx_common_test_helpers:all(?MODULE). + [ + {group, auth}, + {group, noauth} + ]. groups() -> - []. + TCs = emqx_common_test_helpers:all(?MODULE), + [ + {auth, TCs}, + {noauth, TCs} + ]. -cassandra_servers() -> +cassandra_servers(CassandraHost) -> lists:map( fun(#{hostname := Host, port := Port}) -> {Host, Port} end, emqx_schema:parse_servers( - iolist_to_binary([?CASSANDRA_HOST, ":", erlang:integer_to_list(?CASSANDRA_DEFAULT_PORT)]), + iolist_to_binary([CassandraHost, ":", erlang:integer_to_list(?CASSANDRA_DEFAULT_PORT)]), #{default_port => ?CASSANDRA_DEFAULT_PORT} ) ). init_per_suite(Config) -> - case - emqx_common_test_helpers:is_tcp_server_available(?CASSANDRA_HOST, ?CASSANDRA_DEFAULT_PORT) - of + ok = emqx_common_test_helpers:start_apps([emqx_conf]), + ok = emqx_connector_test_helpers:start_apps([emqx_resource]), + {ok, _} = application:ensure_all_started(emqx_connector), + Config. 
+ +init_per_group(Group, Config) -> + {CassandraHost, AuthOpts} = + case Group of + auth -> + {?CASSANDRA_HOST, [{username, ?CASSA_USERNAME}, {password, ?CASSA_PASSWORD}]}; + noauth -> + {?CASSANDRA_HOST_NOAUTH, []} + end, + case emqx_common_test_helpers:is_tcp_server_available(CassandraHost, ?CASSANDRA_DEFAULT_PORT) of true -> - ok = emqx_common_test_helpers:start_apps([emqx_conf]), - ok = emqx_connector_test_helpers:start_apps([emqx_resource]), - {ok, _} = application:ensure_all_started(emqx_connector), %% keyspace `mqtt` must be created in advance {ok, Conn} = ecql:connect([ - {nodes, cassandra_servers()}, - {username, ?CASSA_USERNAME}, - {password, ?CASSA_PASSWORD}, + {nodes, cassandra_servers(CassandraHost)}, {keyspace, "mqtt"} + | AuthOpts ]), ecql:close(Conn), - Config; + [ + {cassa_host, CassandraHost}, + {cassa_auth_opts, AuthOpts} + | Config + ]; false -> case os:getenv("IS_CI") of "yes" -> @@ -75,6 +95,9 @@ init_per_suite(Config) -> end end. +end_per_group(_Group, _Config) -> + ok. + end_per_suite(_Config) -> ok = emqx_common_test_helpers:stop_apps([emqx_conf]), ok = emqx_connector_test_helpers:stop_apps([emqx_resource]), @@ -90,10 +113,10 @@ end_per_testcase(_, _Config) -> %% cases %%-------------------------------------------------------------------- -t_lifecycle(_Config) -> +t_lifecycle(Config) -> perform_lifecycle_check( <<"emqx_connector_cassandra_SUITE">>, - cassandra_config() + cassandra_config(Config) ). 
show(X) -> @@ -168,25 +191,25 @@ perform_lifecycle_check(ResourceId, InitialConfig) -> %% utils %%-------------------------------------------------------------------- -cassandra_config() -> - Config = - #{ +cassandra_config(Config) -> + Host = ?config(cassa_host, Config), + AuthOpts = maps:from_list(?config(cassa_auth_opts, Config)), + CassConfig = + AuthOpts#{ auto_reconnect => true, keyspace => <<"mqtt">>, - username => ?CASSA_USERNAME, - password => ?CASSA_PASSWORD, pool_size => 8, servers => iolist_to_binary( io_lib:format( "~s:~b", [ - ?CASSANDRA_HOST, + Host, ?CASSANDRA_DEFAULT_PORT ] ) ) }, - #{<<"config">> => Config}. + #{<<"config">> => CassConfig}. test_query_no_params() -> {query, <<"SELECT count(1) AS T FROM system.local">>}. From bc9b2703081044b0e25289b8c161f1923c355f00 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Fri, 1 Sep 2023 17:09:57 +0300 Subject: [PATCH 76/85] chore: add changelog for bridges `function_clause` fixes --- changes/ee/fix-11547.en.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 changes/ee/fix-11547.en.md diff --git a/changes/ee/fix-11547.en.md b/changes/ee/fix-11547.en.md new file mode 100644 index 000000000..1a79b32ea --- /dev/null +++ b/changes/ee/fix-11547.en.md @@ -0,0 +1,7 @@ +Fix several emqx_bridge issues: + +- fix Cassandra bridge connect error occurring when the bridge is configured without username/password + (Cassandra doesn't require user credentials when it is configured with `authenticator: AllowAllAuthenticator`) +- fix SQL Server bridge connect error caused by an empty password +- make `username` a required field in Oracle bridge +- fix IoTDB bridge error caused by setting base URL without scheme (e.g. 
`<host>:<port>`) From ed9afe33456ff036999854606a786d00f7f722f7 Mon Sep 17 00:00:00 2001 From: Serge Tupchii Date: Fri, 1 Sep 2023 19:07:23 +0300 Subject: [PATCH 77/85] refactor(emqx_connector): use `hocon_schema:override/2` to make pgsql 'username' field required --- apps/emqx_connector/src/emqx_connector_pgsql.erl | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/apps/emqx_connector/src/emqx_connector_pgsql.erl b/apps/emqx_connector/src/emqx_connector_pgsql.erl index 04ba4fd51..5cc25bfa1 100644 --- a/apps/emqx_connector/src/emqx_connector_pgsql.erl +++ b/apps/emqx_connector/src/emqx_connector_pgsql.erl @@ -85,13 +85,10 @@ server() -> adjust_fields(Fields) -> lists:map( fun - ({username, OrigUsernameFn}) -> - {username, fun - (required) -> - true; - (Any) -> - OrigUsernameFn(Any) - end}; + ({username, Sc}) -> + %% to please dialyzer... + Override = #{type => hocon_schema:field_schema(Sc, type), required => true}, + {username, hocon_schema:override(Sc, Override)}; (Field) -> Field end, From 24230a64df5f4a7b1347607a594c917904587779 Mon Sep 17 00:00:00 2001 From: Ivan Dyachkov Date: Mon, 4 Sep 2023 11:39:21 +0200 Subject: [PATCH 78/85] chore: bump app versions --- apps/emqx/src/emqx.app.src | 2 +- apps/emqx_authn/src/emqx_authn.app.src | 2 +- apps/emqx_connector/src/emqx_connector.app.src | 2 +- apps/emqx_ft/src/emqx_ft.app.src | 2 +- apps/emqx_management/src/emqx_management.app.src | 2 +- apps/emqx_modules/src/emqx_modules.app.src | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/apps/emqx/src/emqx.app.src b/apps/emqx/src/emqx.app.src index a80d6482a..7a98579df 100644 --- a/apps/emqx/src/emqx.app.src +++ b/apps/emqx/src/emqx.app.src @@ -2,7 +2,7 @@ {application, emqx, [ {id, "emqx"}, {description, "EMQX Core"}, - {vsn, "5.1.8"}, + {vsn, "5.1.9"}, {modules, []}, {registered, []}, {applications, [ diff --git a/apps/emqx_authn/src/emqx_authn.app.src b/apps/emqx_authn/src/emqx_authn.app.src index ae7bea5da..1050bc496 100644 --- 
a/apps/emqx_authn/src/emqx_authn.app.src +++ b/apps/emqx_authn/src/emqx_authn.app.src @@ -1,7 +1,7 @@ %% -*- mode: erlang -*- {application, emqx_authn, [ {description, "EMQX Authentication"}, - {vsn, "0.1.25"}, + {vsn, "0.1.26"}, {modules, []}, {registered, [emqx_authn_sup, emqx_authn_registry]}, {applications, [ diff --git a/apps/emqx_connector/src/emqx_connector.app.src b/apps/emqx_connector/src/emqx_connector.app.src index 397cd0093..b43122b36 100644 --- a/apps/emqx_connector/src/emqx_connector.app.src +++ b/apps/emqx_connector/src/emqx_connector.app.src @@ -1,7 +1,7 @@ %% -*- mode: erlang -*- {application, emqx_connector, [ {description, "EMQX Data Integration Connectors"}, - {vsn, "0.1.30"}, + {vsn, "0.1.31"}, {registered, []}, {mod, {emqx_connector_app, []}}, {applications, [ diff --git a/apps/emqx_ft/src/emqx_ft.app.src b/apps/emqx_ft/src/emqx_ft.app.src index 8518958e0..f75b1805b 100644 --- a/apps/emqx_ft/src/emqx_ft.app.src +++ b/apps/emqx_ft/src/emqx_ft.app.src @@ -1,6 +1,6 @@ {application, emqx_ft, [ {description, "EMQX file transfer over MQTT"}, - {vsn, "0.1.5"}, + {vsn, "0.1.6"}, {registered, []}, {mod, {emqx_ft_app, []}}, {applications, [ diff --git a/apps/emqx_management/src/emqx_management.app.src b/apps/emqx_management/src/emqx_management.app.src index e1056ab0c..f16156c35 100644 --- a/apps/emqx_management/src/emqx_management.app.src +++ b/apps/emqx_management/src/emqx_management.app.src @@ -2,7 +2,7 @@ {application, emqx_management, [ {description, "EMQX Management API and CLI"}, % strict semver, bump manually! 
- {vsn, "5.0.28"}, + {vsn, "5.0.29"}, {modules, []}, {registered, [emqx_management_sup]}, {applications, [kernel, stdlib, emqx_plugins, minirest, emqx, emqx_ctl, emqx_bridge_http]}, diff --git a/apps/emqx_modules/src/emqx_modules.app.src b/apps/emqx_modules/src/emqx_modules.app.src index cd2f6c8b9..cceb51895 100644 --- a/apps/emqx_modules/src/emqx_modules.app.src +++ b/apps/emqx_modules/src/emqx_modules.app.src @@ -1,7 +1,7 @@ %% -*- mode: erlang -*- {application, emqx_modules, [ {description, "EMQX Modules"}, - {vsn, "5.0.20"}, + {vsn, "5.0.21"}, {modules, []}, {applications, [kernel, stdlib, emqx, emqx_ctl]}, {mod, {emqx_modules_app, []}}, From 6d984edb13f98425b932061b32b732f3a0afbe5a Mon Sep 17 00:00:00 2001 From: Ivan Dyachkov Date: Mon, 4 Sep 2023 11:53:47 +0200 Subject: [PATCH 79/85] chore: fix typos and use consistent wording in the changelog --- changes/v5.1.6.en.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/changes/v5.1.6.en.md b/changes/v5.1.6.en.md index 3c393c55b..1e92f61d1 100644 --- a/changes/v5.1.6.en.md +++ b/changes/v5.1.6.en.md @@ -4,18 +4,18 @@ - [#11429](https://github.com/emqx/emqx/pull/11429) Added option to configure detection of legacy protocol in MondoDB connectors and bridges. -- [#11436](https://github.com/emqx/emqx/pull/11436) Add a new API endpoint `DELETE /banned` to clear all `banned` data. +- [#11436](https://github.com/emqx/emqx/pull/11436) Added a new API endpoint `DELETE /banned` to clear all `banned` data. -- [#11438](https://github.com/emqx/emqx/pull/11438) Changed the type of the `mqtt.mqx_packet_size` from string to byteSize to better represent the valid numeric range. +- [#11438](https://github.com/emqx/emqx/pull/11438) Changed the type of the `mqtt.max_packet_size` from string to byteSize to better represent the valid numeric range. Strings will still be accepted for backwards compatibility. 
- [#11446](https://github.com/emqx/emqx/pull/11446) Refactored datetime-related modules and functions to simplify the code. -- [#11396](https://github.com/emqx/emqx/pull/11396) Introduce topic index for the rule engine runtime that significantly improves the performance of EMQX with a non-trivial number of rules consuming messages matching different topic filters. +- [#11396](https://github.com/emqx/emqx/pull/11396) Introduced topic index for the rule engine runtime that significantly improves the performance of EMQX with a non-trivial number of rules consuming messages matching different topic filters. ## Bug Fixes -- [#11424](https://github.com/emqx/emqx/pull/11424) Add a check for the maximum value of the timestamp in the API to ensure it is a valid Unix timestamp. +- [#11424](https://github.com/emqx/emqx/pull/11424) Added a check for the maximum value of the timestamp in the API to ensure it is a valid Unix timestamp. - [#11445](https://github.com/emqx/emqx/pull/11445) Removed os_mon application monitor support on Windows platforms to prevent VM crashes. Functionality remains on non-Windows platforms. @@ -25,6 +25,6 @@ - [#11456](https://github.com/emqx/emqx/pull/11456) Removed validation that enforced non-empty PEM for CA cert file. CA certificate file PEM can now be empty. -- [#11499](https://github.com/emqx/emqx/pull/11499) Upgrade Erlang/OTP to 25.3.2-2 +- [#11499](https://github.com/emqx/emqx/pull/11499) Upgraded Erlang/OTP to 25.3.2-2. Erlang/OTP 25.3.2-2 excludes sensitive data from mnesia_hook log message. 
From 60ae3c15c826f6cc5ca961a550fbbf44794058cb Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Mon, 4 Sep 2023 14:06:13 -0300 Subject: [PATCH 80/85] refactor: disregard impossible case --- apps/emqx/src/emqx_persistent_session_ds.erl | 15 +++++---------- apps/emqx/src/emqx_session.erl | 9 +++++++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/apps/emqx/src/emqx_persistent_session_ds.erl b/apps/emqx/src/emqx_persistent_session_ds.erl index 13c35ed9b..43ca8bc02 100644 --- a/apps/emqx/src/emqx_persistent_session_ds.erl +++ b/apps/emqx/src/emqx_persistent_session_ds.erl @@ -146,16 +146,11 @@ del_subscription(IteratorID, TopicFilterBin, DSSessionID) -> begin TopicFilter = emqx_topic:words(TopicFilterBin), Ctx = #{iterator_id => IteratorID}, - case IteratorID of - undefined -> - ok; - _ -> - ?tp_span( - persistent_session_ds_close_iterators, - Ctx, - ok = ensure_iterator_closed_on_all_shards(IteratorID) - ) - end, + ?tp_span( + persistent_session_ds_close_iterators, + Ctx, + ok = ensure_iterator_closed_on_all_shards(IteratorID) + ), ?tp_span( persistent_session_ds_iterator_delete, Ctx, diff --git a/apps/emqx/src/emqx_session.erl b/apps/emqx/src/emqx_session.erl index 1ff4d9b85..859cee76b 100644 --- a/apps/emqx/src/emqx_session.erl +++ b/apps/emqx/src/emqx_session.erl @@ -357,8 +357,13 @@ unsubscribe( session(). remove_persistent_subscription(Session, TopicFilterBin, ClientId) -> Iterators = Session#session.iterators, - IteratorId = maps:get(TopicFilterBin, Iterators, undefined), - _ = emqx_persistent_session_ds:del_subscription(IteratorId, TopicFilterBin, ClientId), + case maps:get(TopicFilterBin, Iterators, undefined) of + undefined -> + ok; + IteratorId -> + _ = emqx_persistent_session_ds:del_subscription(IteratorId, TopicFilterBin, ClientId), + ok + end, Session#session{iterators = maps:remove(TopicFilterBin, Iterators)}. 
%%-------------------------------------------------------------------- From 23d63f5e0108c3bb87b4e543678030931f6e8110 Mon Sep 17 00:00:00 2001 From: Thales Macedo Garitezi Date: Mon, 4 Sep 2023 14:41:00 -0300 Subject: [PATCH 81/85] refactor: fork clientid type for `emqx_ds:session_id/0` --- apps/emqx/src/emqx_persistent_session_ds.erl | 6 +----- apps/emqx_durable_storage/src/emqx_ds.erl | 4 +++- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/apps/emqx/src/emqx_persistent_session_ds.erl b/apps/emqx/src/emqx_persistent_session_ds.erl index 43ca8bc02..83c2375f2 100644 --- a/apps/emqx/src/emqx_persistent_session_ds.erl +++ b/apps/emqx/src/emqx_persistent_session_ds.erl @@ -185,8 +185,7 @@ ensure_all_iterators_closed(DSSessionID) -> %% RPC target. -spec do_ensure_all_iterators_closed(emqx_ds:session_id()) -> ok. -do_ensure_all_iterators_closed(DSSessionID0) -> - DSSessionID = bin(DSSessionID0), +do_ensure_all_iterators_closed(DSSessionID) -> ok = emqx_ds_storage_layer:discard_iterator_prefix(?DS_SHARD, DSSessionID), ok. @@ -202,6 +201,3 @@ deserialize_message(Bin) -> is_store_enabled() -> emqx_config:get([persistent_session_store, ds]). - -bin(B) when is_binary(B) -> B; -bin(A) when is_atom(A) -> atom_to_binary(A, utf8). diff --git a/apps/emqx_durable_storage/src/emqx_ds.erl b/apps/emqx_durable_storage/src/emqx_ds.erl index 095a93745..0a61cad43 100644 --- a/apps/emqx_durable_storage/src/emqx_ds.erl +++ b/apps/emqx_durable_storage/src/emqx_ds.erl @@ -58,7 +58,9 @@ %% Type declarations %%================================================================================ --type session_id() :: emqx_types:clientid(). +%% Currently, this is the clientid. We avoid `emqx_types:clientid()' because that can be +%% an atom, in theory (?). +-type session_id() :: binary(). -type iterator() :: term(). 
From 01a128878f03a06d7b533877df71ea2ba69da83f Mon Sep 17 00:00:00 2001 From: ieQu1 <99872536+ieQu1@users.noreply.github.com> Date: Tue, 5 Sep 2023 11:04:12 +0200 Subject: [PATCH 82/85] chore(mria): Bump mria to 0.6.1 --- apps/emqx/rebar.config | 2 +- changes/ce/fix-11564.en.md | 2 ++ mix.exs | 2 +- rebar.config | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) create mode 100644 changes/ce/fix-11564.en.md diff --git a/apps/emqx/rebar.config b/apps/emqx/rebar.config index eb0465db0..a404e5d81 100644 --- a/apps/emqx/rebar.config +++ b/apps/emqx/rebar.config @@ -28,7 +28,7 @@ {gproc, {git, "https://github.com/emqx/gproc", {tag, "0.9.0.1"}}}, {cowboy, {git, "https://github.com/emqx/cowboy", {tag, "2.9.2"}}}, {esockd, {git, "https://github.com/emqx/esockd", {tag, "5.9.6"}}}, - {ekka, {git, "https://github.com/emqx/ekka", {tag, "0.15.10"}}}, + {ekka, {git, "https://github.com/emqx/ekka", {tag, "0.15.11"}}}, {gen_rpc, {git, "https://github.com/emqx/gen_rpc", {tag, "2.8.1"}}}, {hocon, {git, "https://github.com/emqx/hocon.git", {tag, "0.39.16"}}}, {emqx_http_lib, {git, "https://github.com/emqx/emqx_http_lib.git", {tag, "0.5.3"}}}, diff --git a/changes/ce/fix-11564.en.md b/changes/ce/fix-11564.en.md new file mode 100644 index 000000000..cf6aa28cb --- /dev/null +++ b/changes/ce/fix-11564.en.md @@ -0,0 +1,2 @@ +Fix cluster partition autoheal functionality. +Implement autohealing for the clusters that split into multiple partitions. 
diff --git a/mix.exs b/mix.exs index 225f1753b..e3916245c 100644 --- a/mix.exs +++ b/mix.exs @@ -55,7 +55,7 @@ defmodule EMQXUmbrella.MixProject do {:cowboy, github: "emqx/cowboy", tag: "2.9.2", override: true}, {:esockd, github: "emqx/esockd", tag: "5.9.6", override: true}, {:rocksdb, github: "emqx/erlang-rocksdb", tag: "1.8.0-emqx-1", override: true}, - {:ekka, github: "emqx/ekka", tag: "0.15.10", override: true}, + {:ekka, github: "emqx/ekka", tag: "0.15.11", override: true}, {:gen_rpc, github: "emqx/gen_rpc", tag: "2.8.1", override: true}, {:grpc, github: "emqx/grpc-erl", tag: "0.6.8", override: true}, {:minirest, github: "emqx/minirest", tag: "1.3.11", override: true}, diff --git a/rebar.config b/rebar.config index dae8319e0..450be64b3 100644 --- a/rebar.config +++ b/rebar.config @@ -62,7 +62,7 @@ , {cowboy, {git, "https://github.com/emqx/cowboy", {tag, "2.9.2"}}} , {esockd, {git, "https://github.com/emqx/esockd", {tag, "5.9.6"}}} , {rocksdb, {git, "https://github.com/emqx/erlang-rocksdb", {tag, "1.8.0-emqx-1"}}} - , {ekka, {git, "https://github.com/emqx/ekka", {tag, "0.15.10"}}} + , {ekka, {git, "https://github.com/emqx/ekka", {tag, "0.15.11"}}} , {gen_rpc, {git, "https://github.com/emqx/gen_rpc", {tag, "2.8.1"}}} , {grpc, {git, "https://github.com/emqx/grpc-erl", {tag, "0.6.8"}}} , {minirest, {git, "https://github.com/emqx/minirest", {tag, "1.3.11"}}} From 916306b6ba33bb49bf433a8698bf0188eba823de Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Wed, 6 Sep 2023 09:56:36 +0200 Subject: [PATCH 83/85] chore: delete beta release feature changelog --- changes/ee/feat-11447.en.md | 1 - 1 file changed, 1 deletion(-) delete mode 100644 changes/ee/feat-11447.en.md diff --git a/changes/ee/feat-11447.en.md b/changes/ee/feat-11447.en.md deleted file mode 100644 index caa808861..000000000 --- a/changes/ee/feat-11447.en.md +++ /dev/null @@ -1 +0,0 @@ -Added CLI command to wipe session and retained message data on the whole cluster. 
From 18c6bfec97295d172342e81388e6b704398fa3a9 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Wed, 6 Sep 2023 11:12:56 +0200 Subject: [PATCH 84/85] chore: bump app vsns --- apps/emqx_bridge_gcp_pubsub/src/emqx_bridge_gcp_pubsub.app.src | 2 +- apps/emqx_bridge_kafka/src/emqx_bridge_kafka.app.src | 2 +- apps/emqx_bridge_kinesis/src/emqx_bridge_kinesis.app.src | 2 +- apps/emqx_gateway/src/emqx_gateway.app.src | 2 +- apps/emqx_ldap/src/emqx_ldap.app.src | 2 +- apps/emqx_machine/src/emqx_machine.app.src | 2 +- apps/emqx_opentelemetry/src/emqx_opentelemetry.app.src | 2 +- apps/emqx_rule_engine/src/emqx_rule_engine.app.src | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/apps/emqx_bridge_gcp_pubsub/src/emqx_bridge_gcp_pubsub.app.src b/apps/emqx_bridge_gcp_pubsub/src/emqx_bridge_gcp_pubsub.app.src index c7dcea5c0..9afc0f05e 100644 --- a/apps/emqx_bridge_gcp_pubsub/src/emqx_bridge_gcp_pubsub.app.src +++ b/apps/emqx_bridge_gcp_pubsub/src/emqx_bridge_gcp_pubsub.app.src @@ -1,6 +1,6 @@ {application, emqx_bridge_gcp_pubsub, [ {description, "EMQX Enterprise GCP Pub/Sub Bridge"}, - {vsn, "0.1.7"}, + {vsn, "0.1.8"}, {registered, []}, {applications, [ kernel, diff --git a/apps/emqx_bridge_kafka/src/emqx_bridge_kafka.app.src b/apps/emqx_bridge_kafka/src/emqx_bridge_kafka.app.src index 55b02560b..835932ddb 100644 --- a/apps/emqx_bridge_kafka/src/emqx_bridge_kafka.app.src +++ b/apps/emqx_bridge_kafka/src/emqx_bridge_kafka.app.src @@ -1,7 +1,7 @@ %% -*- mode: erlang -*- {application, emqx_bridge_kafka, [ {description, "EMQX Enterprise Kafka Bridge"}, - {vsn, "0.1.8"}, + {vsn, "0.1.9"}, {registered, [emqx_bridge_kafka_consumer_sup]}, {applications, [ kernel, diff --git a/apps/emqx_bridge_kinesis/src/emqx_bridge_kinesis.app.src b/apps/emqx_bridge_kinesis/src/emqx_bridge_kinesis.app.src index 3eb923b5d..6066e2495 100644 --- a/apps/emqx_bridge_kinesis/src/emqx_bridge_kinesis.app.src +++ b/apps/emqx_bridge_kinesis/src/emqx_bridge_kinesis.app.src @@ -1,6 +1,6 
@@ {application, emqx_bridge_kinesis, [ {description, "EMQX Enterprise Amazon Kinesis Bridge"}, - {vsn, "0.1.1"}, + {vsn, "0.1.2"}, {registered, []}, {applications, [ kernel, diff --git a/apps/emqx_gateway/src/emqx_gateway.app.src b/apps/emqx_gateway/src/emqx_gateway.app.src index 582269ce6..47899ceeb 100644 --- a/apps/emqx_gateway/src/emqx_gateway.app.src +++ b/apps/emqx_gateway/src/emqx_gateway.app.src @@ -1,7 +1,7 @@ %% -*- mode: erlang -*- {application, emqx_gateway, [ {description, "The Gateway management application"}, - {vsn, "0.1.23"}, + {vsn, "0.1.24"}, {registered, []}, {mod, {emqx_gateway_app, []}}, {applications, [kernel, stdlib, emqx, emqx_authn, emqx_ctl]}, diff --git a/apps/emqx_ldap/src/emqx_ldap.app.src b/apps/emqx_ldap/src/emqx_ldap.app.src index 7a252dd33..152a7b6a9 100644 --- a/apps/emqx_ldap/src/emqx_ldap.app.src +++ b/apps/emqx_ldap/src/emqx_ldap.app.src @@ -1,6 +1,6 @@ {application, emqx_ldap, [ {description, "EMQX LDAP Connector"}, - {vsn, "0.1.1"}, + {vsn, "0.1.2"}, {registered, []}, {applications, [ kernel, diff --git a/apps/emqx_machine/src/emqx_machine.app.src b/apps/emqx_machine/src/emqx_machine.app.src index dd1915cfb..813d41e5b 100644 --- a/apps/emqx_machine/src/emqx_machine.app.src +++ b/apps/emqx_machine/src/emqx_machine.app.src @@ -3,7 +3,7 @@ {id, "emqx_machine"}, {description, "The EMQX Machine"}, % strict semver, bump manually! 
- {vsn, "0.2.12"}, + {vsn, "0.2.13"}, {modules, []}, {registered, []}, {applications, [kernel, stdlib, emqx_ctl]}, diff --git a/apps/emqx_opentelemetry/src/emqx_opentelemetry.app.src b/apps/emqx_opentelemetry/src/emqx_opentelemetry.app.src index 7202b24c8..d459fc107 100644 --- a/apps/emqx_opentelemetry/src/emqx_opentelemetry.app.src +++ b/apps/emqx_opentelemetry/src/emqx_opentelemetry.app.src @@ -1,6 +1,6 @@ {application, emqx_opentelemetry, [ {description, "OpenTelemetry for EMQX Broker"}, - {vsn, "0.1.1"}, + {vsn, "0.1.2"}, {registered, []}, {mod, {emqx_otel_app, []}}, {applications, [kernel, stdlib, emqx]}, diff --git a/apps/emqx_rule_engine/src/emqx_rule_engine.app.src b/apps/emqx_rule_engine/src/emqx_rule_engine.app.src index e6d00bcae..23e4a3f05 100644 --- a/apps/emqx_rule_engine/src/emqx_rule_engine.app.src +++ b/apps/emqx_rule_engine/src/emqx_rule_engine.app.src @@ -2,7 +2,7 @@ {application, emqx_rule_engine, [ {description, "EMQX Rule Engine"}, % strict semver, bump manually! - {vsn, "5.0.23"}, + {vsn, "5.0.24"}, {modules, []}, {registered, [emqx_rule_engine_sup, emqx_rule_engine]}, {applications, [kernel, stdlib, rulesql, getopt, emqx_ctl, uuid]}, From 7c2f87fabee0cce025417a3d52e2a988d1af931f Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Wed, 6 Sep 2023 21:36:16 +0200 Subject: [PATCH 85/85] test: merge broker and router boot modules --- apps/emqx/src/emqx_boot.erl | 4 ++-- apps/emqx/src/emqx_sup.erl | 2 +- apps/emqx/test/emqx_access_control_SUITE.erl | 2 +- apps/emqx/test/emqx_boot_SUITE.erl | 11 ++--------- apps/emqx/test/emqx_router_SUITE.erl | 2 +- apps/emqx/test/emqx_router_helper_SUITE.erl | 4 ++-- .../test/emqx_bridge_gcp_pubsub_consumer_SUITE.erl | 2 +- .../test/emqx_bridge_kafka_impl_consumer_SUITE.erl | 2 +- .../test/emqx_bridge_pulsar_impl_producer_SUITE.erl | 2 +- .../test/emqx_mgmt_data_backup_SUITE.erl | 2 +- .../test/emqx_schema_registry_SUITE.erl | 2 +- 11 files changed, 14 insertions(+), 21 deletions(-) diff --git 
a/apps/emqx/src/emqx_boot.erl b/apps/emqx/src/emqx_boot.erl index 6cbac558f..b3dfcda33 100644 --- a/apps/emqx/src/emqx_boot.erl +++ b/apps/emqx/src/emqx_boot.erl @@ -18,9 +18,9 @@ -export([is_enabled/1]). --define(BOOT_MODULES, [router, broker, listeners]). +-define(BOOT_MODULES, [broker, listeners]). --spec is_enabled(all | router | broker | listeners) -> boolean(). +-spec is_enabled(all | broker | listeners) -> boolean(). is_enabled(Mod) -> (BootMods = boot_modules()) =:= all orelse lists:member(Mod, BootMods). diff --git a/apps/emqx/src/emqx_sup.erl b/apps/emqx/src/emqx_sup.erl index 8c79e7482..1893dba86 100644 --- a/apps/emqx/src/emqx_sup.erl +++ b/apps/emqx/src/emqx_sup.erl @@ -74,7 +74,7 @@ init([]) -> Children = [KernelSup] ++ [SessionSup || emqx_persistent_session:is_store_enabled()] ++ - [RouterSup || emqx_boot:is_enabled(router)] ++ + [RouterSup || emqx_boot:is_enabled(broker)] ++ [BrokerSup || emqx_boot:is_enabled(broker)] ++ [CMSup || emqx_boot:is_enabled(broker)] ++ [SysSup, Limiter], diff --git a/apps/emqx/test/emqx_access_control_SUITE.erl b/apps/emqx/test/emqx_access_control_SUITE.erl index 5d4344de6..8f7a1fa6c 100644 --- a/apps/emqx/test/emqx_access_control_SUITE.erl +++ b/apps/emqx/test/emqx_access_control_SUITE.erl @@ -26,7 +26,7 @@ all() -> emqx_common_test_helpers:all(?MODULE). init_per_suite(Config) -> - emqx_common_test_helpers:boot_modules([router, broker]), + emqx_common_test_helpers:boot_modules([broker]), emqx_common_test_helpers:start_apps([]), Config. diff --git a/apps/emqx/test/emqx_boot_SUITE.erl b/apps/emqx/test/emqx_boot_SUITE.erl index 06f08afb8..006888d9c 100644 --- a/apps/emqx/test/emqx_boot_SUITE.erl +++ b/apps/emqx/test/emqx_boot_SUITE.erl @@ -26,19 +26,12 @@ all() -> emqx_common_test_helpers:all(?MODULE). 
t_is_enabled(_) -> try ok = application:set_env(emqx, boot_modules, all), - ?assert(emqx_boot:is_enabled(router)), ?assert(emqx_boot:is_enabled(broker)), ?assert(emqx_boot:is_enabled(listeners)), - ok = application:set_env(emqx, boot_modules, [router]), - ?assert(emqx_boot:is_enabled(router)), - ?assertNot(emqx_boot:is_enabled(broker)), - ?assertNot(emqx_boot:is_enabled(listeners)), - ok = application:set_env(emqx, boot_modules, [router, broker]), - ?assert(emqx_boot:is_enabled(router)), + ok = application:set_env(emqx, boot_modules, [broker]), ?assert(emqx_boot:is_enabled(broker)), ?assertNot(emqx_boot:is_enabled(listeners)), - ok = application:set_env(emqx, boot_modules, [router, broker, listeners]), - ?assert(emqx_boot:is_enabled(router)), + ok = application:set_env(emqx, boot_modules, [broker, listeners]), ?assert(emqx_boot:is_enabled(broker)), ?assert(emqx_boot:is_enabled(listeners)) after diff --git a/apps/emqx/test/emqx_router_SUITE.erl b/apps/emqx/test/emqx_router_SUITE.erl index 1128112ff..9729d8ddc 100644 --- a/apps/emqx/test/emqx_router_SUITE.erl +++ b/apps/emqx/test/emqx_router_SUITE.erl @@ -44,7 +44,7 @@ init_per_group(GroupName, Config) -> AppSpecs = [ {emqx, #{ config => mk_config(GroupName), - override_env => [{boot_modules, [router]}] + override_env => [{boot_modules, [broker]}] }} ], Apps = emqx_cth_suite:start(AppSpecs, #{work_dir => WorkDir}), diff --git a/apps/emqx/test/emqx_router_helper_SUITE.erl b/apps/emqx/test/emqx_router_helper_SUITE.erl index 889c8293c..8fe052af8 100644 --- a/apps/emqx/test/emqx_router_helper_SUITE.erl +++ b/apps/emqx/test/emqx_router_helper_SUITE.erl @@ -51,12 +51,12 @@ end_per_group(_GroupName, Config) -> mk_config(routing_schema_v1) -> #{ config => "broker.routing.storage_schema = v1", - override_env => [{boot_modules, [router]}] + override_env => [{boot_modules, [broker]}] }; mk_config(routing_schema_v2) -> #{ config => "broker.routing.storage_schema = v2", - override_env => [{boot_modules, [router]}] + override_env 
=> [{boot_modules, [broker]}] }. init_per_testcase(_TestCase, Config) -> diff --git a/apps/emqx_bridge_gcp_pubsub/test/emqx_bridge_gcp_pubsub_consumer_SUITE.erl b/apps/emqx_bridge_gcp_pubsub/test/emqx_bridge_gcp_pubsub_consumer_SUITE.erl index 8dc6cd7c4..60c54ebda 100644 --- a/apps/emqx_bridge_gcp_pubsub/test/emqx_bridge_gcp_pubsub_consumer_SUITE.erl +++ b/apps/emqx_bridge_gcp_pubsub/test/emqx_bridge_gcp_pubsub_consumer_SUITE.erl @@ -577,7 +577,7 @@ cluster(Config) -> {schema_mod, emqx_enterprise_schema}, {env_handler, fun (emqx) -> - application:set_env(emqx, boot_modules, [broker, router]), + application:set_env(emqx, boot_modules, [broker]), ok; (emqx_conf) -> ok; diff --git a/apps/emqx_bridge_kafka/test/emqx_bridge_kafka_impl_consumer_SUITE.erl b/apps/emqx_bridge_kafka/test/emqx_bridge_kafka_impl_consumer_SUITE.erl index 1691fa6a4..60a571b2d 100644 --- a/apps/emqx_bridge_kafka/test/emqx_bridge_kafka_impl_consumer_SUITE.erl +++ b/apps/emqx_bridge_kafka/test/emqx_bridge_kafka_impl_consumer_SUITE.erl @@ -1101,7 +1101,7 @@ cluster(Config) -> {load_apps, [emqx_machine]}, {env_handler, fun (emqx) -> - application:set_env(emqx, boot_modules, [broker, router]), + application:set_env(emqx, boot_modules, [broker]), ExtraEnvHandlerHook(), ok; (emqx_conf) -> diff --git a/apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_impl_producer_SUITE.erl b/apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_impl_producer_SUITE.erl index fb358906f..44d28c31a 100644 --- a/apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_impl_producer_SUITE.erl +++ b/apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_impl_producer_SUITE.erl @@ -536,7 +536,7 @@ cluster(Config) -> {schema_mod, emqx_enterprise_schema}, {env_handler, fun (emqx) -> - application:set_env(emqx, boot_modules, [broker, router]), + application:set_env(emqx, boot_modules, [broker]), ok; (emqx_conf) -> ok; diff --git a/apps/emqx_management/test/emqx_mgmt_data_backup_SUITE.erl b/apps/emqx_management/test/emqx_mgmt_data_backup_SUITE.erl index 
04982a087..7cb2c9cf1 100644 --- a/apps/emqx_management/test/emqx_mgmt_data_backup_SUITE.erl +++ b/apps/emqx_management/test/emqx_mgmt_data_backup_SUITE.erl @@ -432,7 +432,7 @@ create_test_tab(Attributes) -> apps_to_start() -> [ - {emqx, #{override_env => [{boot_modules, [broker, router]}]}}, + {emqx, #{override_env => [{boot_modules, [broker]}]}}, {emqx_conf, #{config => #{dashboard => #{listeners => #{http => #{bind => <<"0">>}}}}}}, emqx_psk, emqx_management, diff --git a/apps/emqx_schema_registry/test/emqx_schema_registry_SUITE.erl b/apps/emqx_schema_registry/test/emqx_schema_registry_SUITE.erl index e2a696428..7aea09457 100644 --- a/apps/emqx_schema_registry/test/emqx_schema_registry_SUITE.erl +++ b/apps/emqx_schema_registry/test/emqx_schema_registry_SUITE.erl @@ -368,7 +368,7 @@ cluster(Config) -> {load_apps, [emqx_machine]}, {env_handler, fun (emqx) -> - application:set_env(emqx, boot_modules, [broker, router]), + application:set_env(emqx, boot_modules, [broker]), ok; (emqx_conf) -> ok;