From 9064b5acb8248599323b7f7fa8fc4e789fb700a3 Mon Sep 17 00:00:00 2001 From: Ilya Averyanov Date: Thu, 28 Jul 2022 15:42:57 +0300 Subject: [PATCH 1/3] chore(ci): make apps-version-check.sh accept new apps --- scripts/apps-version-check.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/apps-version-check.sh b/scripts/apps-version-check.sh index ffef46c87..2cb05d3ed 100755 --- a/scripts/apps-version-check.sh +++ b/scripts/apps-version-check.sh @@ -19,6 +19,14 @@ while read -r app; do app_path="." fi src_file="$app_path/src/$(basename "$app").app.src" + + old_app_exists=0 + git show "$latest_release":"$src_file" >/dev/null 2>&1 || old_app_exists="$?" + if [ "$old_app_exists" != "0" ]; then + echo "$app is new, skipping version check" + continue + fi + old_app_version="$(git show "$latest_release":"$src_file" | grep vsn | grep -oE '"[0-9]+\.[0-9]+\.[0-9]+"' | tr -d '"')" now_app_version=$(grep -E 'vsn' "$src_file" | grep -oE '"[0-9]+\.[0-9]+\.[0-9]+"' | tr -d '"') if [ "$old_app_version" = "$now_app_version" ]; then @@ -48,7 +56,7 @@ while read -r app; do now_app_version_semver=($(parse_semver "$now_app_version")) if [ "${old_app_version_semver[0]}" = "${now_app_version_semver[0]}" ] && \ [ "${old_app_version_semver[1]}" = "${now_app_version_semver[1]}" ] && \ - [ "$(( "${old_app_version_semver[2]}" + 1 ))" = "${now_app_version_semver[2]}" ]; then + [ "$(( old_app_version_semver[2] + 1 ))" = "${now_app_version_semver[2]}" ]; then true else echo "$src_file: non-strict semver version bump from $old_app_version to $now_app_version" From a19fbe214f2971e9971185e705cbff7296101087 Mon Sep 17 00:00:00 2001 From: Ilya Averyanov Date: Thu, 28 Apr 2022 15:58:17 +0300 Subject: [PATCH 2/3] feat(node_rebalance): implement node evacuation and rebalancing --- CHANGES-4.3.md | 2 + apps/emqx_eviction_agent/.gitignore | 19 + apps/emqx_eviction_agent/README.md | 9 + .../etc/emqx_eviction_agent.conf | 3 + .../priv/emqx_eviction_agent.schema | 0 
apps/emqx_eviction_agent/rebar.config | 2 + .../src/emqx_eviction_agent.app.src | 18 + .../src/emqx_eviction_agent.erl | 291 ++++++++++++ .../src/emqx_eviction_agent_api.erl | 36 ++ .../src/emqx_eviction_agent_app.erl | 36 ++ .../src/emqx_eviction_agent_channel.erl | 299 +++++++++++++ .../src/emqx_eviction_agent_cli.erl | 42 ++ .../src/emqx_eviction_agent_conn_sup.erl | 33 ++ .../src/emqx_eviction_agent_sup.erl | 43 ++ .../test/emqx_eviction_agent_SUITE.erl | 232 ++++++++++ .../test/emqx_eviction_agent_api_SUITE.erl | 64 +++ .../emqx_eviction_agent_channel_SUITE.erl | 155 +++++++ .../test/emqx_eviction_agent_cli_SUITE.erl | 47 ++ .../test/emqx_eviction_agent_test_helpers.erl | 55 +++ .../test/emqx_mgmt_api_SUITE.erl | 56 +-- .../test/emqx_mgmt_api_test_helpers.erl | 69 +++ apps/emqx_node_rebalance/.gitignore | 19 + apps/emqx_node_rebalance/README.md | 9 + .../etc/emqx_node_rebalance.conf | 3 + .../include/emqx_node_rebalance.hrl | 31 ++ .../priv/emqx_node_rebalance.schema | 0 apps/emqx_node_rebalance/rebar.config | 2 + .../src/emqx_node_rebalance.app.src | 19 + .../src/emqx_node_rebalance.erl | 414 ++++++++++++++++++ .../src/emqx_node_rebalance_agent.erl | 127 ++++++ .../src/emqx_node_rebalance_api.erl | 243 ++++++++++ .../src/emqx_node_rebalance_app.erl | 34 ++ .../src/emqx_node_rebalance_cli.erl | 265 +++++++++++ .../src/emqx_node_rebalance_evacuation.erl | 298 +++++++++++++ ...emqx_node_rebalance_evacuation_persist.erl | 109 +++++ .../src/emqx_node_rebalance_status.erl | 225 ++++++++++ .../src/emqx_node_rebalance_sup.erl | 44 ++ .../test/emqx_node_rebalance_SUITE.erl | 183 ++++++++ .../test/emqx_node_rebalance_agent_SUITE.erl | 163 +++++++ .../test/emqx_node_rebalance_api_SUITE.erl | 321 ++++++++++++++ .../test/emqx_node_rebalance_cli_SUITE.erl | 199 +++++++++ .../emqx_node_rebalance_evacuation_SUITE.erl | 194 ++++++++ ...ode_rebalance_evacuation_persist_SUITE.erl | 107 +++++ .../src/emqx_rule_engine.app.src | 2 +- .../src/emqx_rule_engine.appup.src | 6 +- 
data/loaded_plugins.tmpl | 2 + .../emqx_dashboard/src/emqx_dashboard.app.src | 2 +- rebar.config.erl | 4 + src/emqx.app.src | 2 +- src/emqx.appup.src | 46 +- src/emqx_channel.erl | 20 +- src/emqx_cm.erl | 46 +- src/emqx_plugins.erl | 2 + test/emqx_node_helpers.erl | 83 ++++ test/emqx_plugins_SUITE.erl | 2 + test/emqx_shared_sub_SUITE.erl | 61 +-- 56 files changed, 4668 insertions(+), 130 deletions(-) create mode 100644 apps/emqx_eviction_agent/.gitignore create mode 100644 apps/emqx_eviction_agent/README.md create mode 100644 apps/emqx_eviction_agent/etc/emqx_eviction_agent.conf create mode 100644 apps/emqx_eviction_agent/priv/emqx_eviction_agent.schema create mode 100644 apps/emqx_eviction_agent/rebar.config create mode 100644 apps/emqx_eviction_agent/src/emqx_eviction_agent.app.src create mode 100644 apps/emqx_eviction_agent/src/emqx_eviction_agent.erl create mode 100644 apps/emqx_eviction_agent/src/emqx_eviction_agent_api.erl create mode 100644 apps/emqx_eviction_agent/src/emqx_eviction_agent_app.erl create mode 100644 apps/emqx_eviction_agent/src/emqx_eviction_agent_channel.erl create mode 100644 apps/emqx_eviction_agent/src/emqx_eviction_agent_cli.erl create mode 100644 apps/emqx_eviction_agent/src/emqx_eviction_agent_conn_sup.erl create mode 100644 apps/emqx_eviction_agent/src/emqx_eviction_agent_sup.erl create mode 100644 apps/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl create mode 100644 apps/emqx_eviction_agent/test/emqx_eviction_agent_api_SUITE.erl create mode 100644 apps/emqx_eviction_agent/test/emqx_eviction_agent_channel_SUITE.erl create mode 100644 apps/emqx_eviction_agent/test/emqx_eviction_agent_cli_SUITE.erl create mode 100644 apps/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl create mode 100644 apps/emqx_management/test/emqx_mgmt_api_test_helpers.erl create mode 100644 apps/emqx_node_rebalance/.gitignore create mode 100644 apps/emqx_node_rebalance/README.md create mode 100644 
apps/emqx_node_rebalance/etc/emqx_node_rebalance.conf create mode 100644 apps/emqx_node_rebalance/include/emqx_node_rebalance.hrl create mode 100644 apps/emqx_node_rebalance/priv/emqx_node_rebalance.schema create mode 100644 apps/emqx_node_rebalance/rebar.config create mode 100644 apps/emqx_node_rebalance/src/emqx_node_rebalance.app.src create mode 100644 apps/emqx_node_rebalance/src/emqx_node_rebalance.erl create mode 100644 apps/emqx_node_rebalance/src/emqx_node_rebalance_agent.erl create mode 100644 apps/emqx_node_rebalance/src/emqx_node_rebalance_api.erl create mode 100644 apps/emqx_node_rebalance/src/emqx_node_rebalance_app.erl create mode 100644 apps/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl create mode 100644 apps/emqx_node_rebalance/src/emqx_node_rebalance_evacuation.erl create mode 100644 apps/emqx_node_rebalance/src/emqx_node_rebalance_evacuation_persist.erl create mode 100644 apps/emqx_node_rebalance/src/emqx_node_rebalance_status.erl create mode 100644 apps/emqx_node_rebalance/src/emqx_node_rebalance_sup.erl create mode 100644 apps/emqx_node_rebalance/test/emqx_node_rebalance_SUITE.erl create mode 100644 apps/emqx_node_rebalance/test/emqx_node_rebalance_agent_SUITE.erl create mode 100644 apps/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl create mode 100644 apps/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl create mode 100644 apps/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl create mode 100644 apps/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_persist_SUITE.erl create mode 100644 test/emqx_node_helpers.erl diff --git a/CHANGES-4.3.md b/CHANGES-4.3.md index 1345ddc65..10304fc70 100644 --- a/CHANGES-4.3.md +++ b/CHANGES-4.3.md @@ -21,6 +21,7 @@ File format: - Improve error message for LwM2M plugin when object ID is not valid [#8654](https://github.com/emqx/emqx/pull/8654). - Add tzdata apk package to alpine docker image. 
[#8671](https://github.com/emqx/emqx/pull/8671) +- Add node evacuation and cluster rebalancing features [#8597] ## v4.3.19 @@ -55,6 +56,7 @@ File format: - HTTP API(GET /rules/) support for pagination and fuzzy filtering. [#8450] - Add check_conf cli to check config format. [#8486] - Optimize performance of shared subscription +- Make possible to debug-print SSL handshake procedure by setting listener config `log_level=debug` [#8553](https://github.com/emqx/emqx/pull/8553) ## v4.3.16 diff --git a/apps/emqx_eviction_agent/.gitignore b/apps/emqx_eviction_agent/.gitignore new file mode 100644 index 000000000..f1c455451 --- /dev/null +++ b/apps/emqx_eviction_agent/.gitignore @@ -0,0 +1,19 @@ +.rebar3 +_* +.eunit +*.o +*.beam +*.plt +*.swp +*.swo +.erlang.cookie +ebin +log +erl_crash.dump +.rebar +logs +_build +.idea +*.iml +rebar3.crashdump +*~ diff --git a/apps/emqx_eviction_agent/README.md b/apps/emqx_eviction_agent/README.md new file mode 100644 index 000000000..f9b8037bf --- /dev/null +++ b/apps/emqx_eviction_agent/README.md @@ -0,0 +1,9 @@ +emqx_eviction_agent +===== + +An OTP library + +Build +----- + + $ rebar3 compile diff --git a/apps/emqx_eviction_agent/etc/emqx_eviction_agent.conf b/apps/emqx_eviction_agent/etc/emqx_eviction_agent.conf new file mode 100644 index 000000000..011b7fb0f --- /dev/null +++ b/apps/emqx_eviction_agent/etc/emqx_eviction_agent.conf @@ -0,0 +1,3 @@ +##-------------------------------------------------------------------- +## EMQX Eviction Agent Plugin +##-------------------------------------------------------------------- diff --git a/apps/emqx_eviction_agent/priv/emqx_eviction_agent.schema b/apps/emqx_eviction_agent/priv/emqx_eviction_agent.schema new file mode 100644 index 000000000..e69de29bb diff --git a/apps/emqx_eviction_agent/rebar.config b/apps/emqx_eviction_agent/rebar.config new file mode 100644 index 000000000..2656fd554 --- /dev/null +++ b/apps/emqx_eviction_agent/rebar.config @@ -0,0 +1,2 @@ +{erl_opts, [debug_info]}. 
+{deps, []}. diff --git a/apps/emqx_eviction_agent/src/emqx_eviction_agent.app.src b/apps/emqx_eviction_agent/src/emqx_eviction_agent.app.src new file mode 100644 index 000000000..ffa28af61 --- /dev/null +++ b/apps/emqx_eviction_agent/src/emqx_eviction_agent.app.src @@ -0,0 +1,18 @@ +{application, emqx_eviction_agent, + [{description, "EMQX Eviction Agent"}, + {vsn, "4.3.0"}, + {registered, [emqx_eviction_agent_sup, + emqx_eviction_agent, + emqx_eviction_agent_conn_sup]}, + {applications, + [kernel, + stdlib + ]}, + {mod, {emqx_eviction_agent_app,[]}}, + {env,[]}, + {modules, []}, + {maintainers, ["EMQX Team "]}, + {links, [{"Homepage", "https://emqx.io/"}, + {"Github", "https://github.com/emqx"} + ]} + ]}. diff --git a/apps/emqx_eviction_agent/src/emqx_eviction_agent.erl b/apps/emqx_eviction_agent/src/emqx_eviction_agent.erl new file mode 100644 index 000000000..a8cf442cf --- /dev/null +++ b/apps/emqx_eviction_agent/src/emqx_eviction_agent.erl @@ -0,0 +1,291 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_eviction_agent). + +-include_lib("emqx/include/emqx_mqtt.hrl"). +-include_lib("emqx/include/logger.hrl"). +-include_lib("emqx/include/types.hrl"). + +-include_lib("stdlib/include/qlc.hrl"). 
+-include_lib("snabbkaffe/include/snabbkaffe.hrl").
+
+-export([start_link/0,
+         enable/2,
+         disable/1,
+         status/0,
+         connection_count/0,
+         session_count/0,
+         session_count/1,
+         evict_connections/1,
+         evict_sessions/2,
+         evict_sessions/3,
+         evict_session_channel/3
+        ]).
+
+-behaviour(gen_server).
+
+-export([init/1,
+         handle_call/3,
+         handle_info/2,
+         handle_cast/2,
+         code_change/3
+        ]).
+
+-export([on_connect/2,
+         on_connack/3]).
+
+-export([hook/0,
+         unhook/0]).
+
+-export_type([server_reference/0]).
+
+%%--------------------------------------------------------------------
+%% APIs
+%%--------------------------------------------------------------------
+
+-type server_reference() :: binary() | undefined.
+-type status() :: {enabled, conn_stats()} | disabled.
+-type conn_stats() :: #{connections := non_neg_integer(),
+                        sessions := non_neg_integer()}.
+-type kind() :: atom().
+
+%% @doc Start the agent as a gen_server registered locally as ?MODULE.
+-spec start_link() -> startlink_ret().
+start_link() ->
+    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+%% @doc Enable eviction on behalf of `Kind'.
+%% Enabling again with the same kind replaces the stored server reference;
+%% while another kind holds the agent the call returns
+%% `{error, eviction_agent_busy}'.
+-spec enable(kind(), server_reference()) -> ok_or_error(eviction_agent_busy).
+enable(Kind, ServerReference) ->
+    gen_server:call(?MODULE, {enable, Kind, ServerReference}).
+
+%% @doc Disable eviction that was enabled by `Kind'.
+%% Returns `{error, disabled}' when eviction is not enabled and
+%% `{error, eviction_agent_busy}' when it is held by a different kind.
+-spec disable(kind()) -> ok_or_error(disabled | eviction_agent_busy).
+disable(Kind) ->
+    gen_server:call(?MODULE, {disable, Kind}).
+
+%% @doc Report whether eviction is enabled; when it is, the current
+%% connection and session counts are included.
+-spec status() -> status().
+status() ->
+    case enable_status() of
+        {enabled, _Kind, _ServerReference} ->
+            {enabled, stats()};
+        disabled ->
+            disabled
+    end.
+
+%% @doc Ask up to `N' live connections to disconnect, pointing them at the
+%% server reference stored when eviction was enabled.
+-spec evict_connections(pos_integer()) -> ok_or_error(disabled).
+evict_connections(N) ->
+    case enable_status() of
+        {enabled, _Kind, ServerReference} ->
+            ok = do_evict_connections(N, ServerReference);
+        disabled ->
+            {error, disabled}
+    end.
+
+%% @doc Evict up to `N' sessions, regardless of connection state, to the
+%% given node or non-empty list of nodes.
+-spec evict_sessions(pos_integer(), node() | [node()]) -> ok_or_error(disabled).
+evict_sessions(N, Node) when is_atom(Node) ->
+    evict_sessions(N, [Node]);
+%% Match a non-empty list by shape instead of walking it with length/1.
+evict_sessions(N, [_ | _] = Nodes) ->
+    evict_sessions(N, Nodes, any).
+
+%% @doc Evict up to `N' sessions whose connection state equals `ConnState'
+%% (`any' matches every state) to the given node or non-empty node list.
+-spec evict_sessions(pos_integer(), node() | [node()], atom()) -> ok_or_error(disabled).
+evict_sessions(N, Node, ConnState) when is_atom(Node) ->
+    evict_sessions(N, [Node], ConnState);
+%% Match a non-empty list by shape instead of walking it with length/1.
+evict_sessions(N, [_ | _] = Nodes, ConnState) ->
+    case enable_status() of
+        {enabled, _Kind, _ServerReference} ->
+            ok = do_evict_sessions(N, Nodes, ConnState);
+        disabled ->
+            {error, disabled}
+    end.
+
+%%--------------------------------------------------------------------
+%% gen_server callbacks
+%%--------------------------------------------------------------------
+
+%% The eviction state lives in persistent_term under ?MODULE; it is cleared
+%% on every (re)start, so the agent always starts disabled.
+init([]) ->
+    _ = persistent_term:erase(?MODULE),
+    {ok, #{}}.
+
+%% enable
+%% Stores {enabled, Kind, ServerReference}. Allowed when disabled or when the
+%% same Kind already holds the agent (the second clause matches on the bound
+%% Kind); any other kind is refused as busy.
+handle_call({enable, Kind, ServerReference}, _From, St) ->
+    Reply = case enable_status() of
+                disabled ->
+                    ok = persistent_term:put(?MODULE, {enabled, Kind, ServerReference});
+                {enabled, Kind, _ServerReference} ->
+                    ok = persistent_term:put(?MODULE, {enabled, Kind, ServerReference});
+                {enabled, _OtherKind, _ServerReference} ->
+                    {error, eviction_agent_busy}
+            end,
+    {reply, Reply, St};
+
+%% disable
+%% Only the kind that enabled eviction may disable it.
+handle_call({disable, Kind}, _From, St) ->
+    Reply = case enable_status() of
+                disabled ->
+                    {error, disabled};
+                {enabled, Kind, _ServerReference} ->
+                    _ = persistent_term:erase(?MODULE),
+                    ok;
+                {enabled, _OtherKind, _ServerReference} ->
+                    {error, eviction_agent_busy}
+            end,
+    {reply, Reply, St}.
+
+%% Unexpected messages are logged and dropped.
+handle_info(Msg, St) ->
+    ?LOG(warning, "Unknown Msg: ~p, State: ~p", [Msg, St]),
+    {noreply, St}.
+
+%% No casts are part of the protocol; log and ignore.
+handle_cast(Msg, St) ->
+    ?LOG(warning, "Unknown cast Msg: ~p, State: ~p", [Msg, St]),
+    {noreply, St}.
+
+code_change(_Vsn, State, _Extra) ->
+    {ok, State}.
+
+%%--------------------------------------------------------------------
+%% Hook callbacks
+%%--------------------------------------------------------------------
+
+%% 'client.connect' hook: while eviction is enabled every new connection is
+%% stopped with ?RC_USE_ANOTHER_SERVER; otherwise the hook does nothing.
+on_connect(_ConnInfo, _Props) ->
+    case enable_status() of
+        {enabled, _Kind, _ServerReference} ->
+            {stop, {error, ?RC_USE_ANOTHER_SERVER}};
+        disabled ->
+            ignore
+    end.
+
+%% 'client.connack' hook: for an MQTT v5 client whose CONNACK reason is
+%% use_another_server, add the stored Server-Reference to the properties
+%% while eviction is enabled. Every other CONNACK passes through unchanged.
+on_connack(#{proto_name := <<"MQTT">>, proto_ver := ?MQTT_PROTO_V5},
+           use_another_server,
+           Props) ->
+    case enable_status() of
+        {enabled, _Kind, ServerReference} ->
+            {ok, Props#{'Server-Reference' => ServerReference}};
+        disabled ->
+            {ok, Props}
+    end;
+on_connack(_ClientInfo, _Reason, Props) ->
+    {ok, Props}.
+
+%%--------------------------------------------------------------------
+%% Hook funcs
+%%--------------------------------------------------------------------
+
+%% Register the connack/connect callbacks with emqx_hooks.
+hook() ->
+    ?tp(debug, eviction_agent_hook, #{}),
+    ok = emqx_hooks:put('client.connack', {?MODULE, on_connack, []}),
+    ok = emqx_hooks:put('client.connect', {?MODULE, on_connect, []}).
+
+%% Remove the callbacks registered by hook/0.
+unhook() ->
+    ?tp(debug, eviction_agent_unhook, #{}),
+    ok = emqx_hooks:del('client.connect', {?MODULE, on_connect}),
+    ok = emqx_hooks:del('client.connack', {?MODULE, on_connack}).
+
+%% Current eviction state from persistent_term; `disabled' when nothing is
+%% stored under ?MODULE.
+enable_status() ->
+    persistent_term:get(?MODULE, disabled).
+
+% connection management
+stats() ->
+    #{
+        connections => connection_count(),
+        sessions => session_count()
+    }.
+
+%% Query handle supplied by emqx_cm; take_connections/1 consumes it as
+%% {ClientId, ChanPid} pairs.
+connection_table() ->
+    emqx_cm:live_connection_table().
+
+connection_count() ->
+    table_count(connection_table()).
+
+%% QLC handle yielding {ClientId, ConnInfo, ClientInfo} for channels that have
+%% a session. `any' applies no filter; any other value keeps only rows whose
+%% conn state is exactly equal to it.
+channel_with_session_table(any) ->
+    qlc:q([{ClientId, ConnInfo, ClientInfo}
+           || {ClientId, _, ConnInfo, ClientInfo} <- emqx_cm:channel_with_session_table()]);
+channel_with_session_table(RequiredConnState) ->
+    qlc:q([{ClientId, ConnInfo, ClientInfo}
+           || {ClientId, ConnState, ConnInfo, ClientInfo} <- emqx_cm:channel_with_session_table(),
+              RequiredConnState =:= ConnState]).
+
+session_count() ->
+    session_count(any).
+
+session_count(ConnState) ->
+    table_count(channel_with_session_table(ConnState)).
+
+%% Count the answers of a query handle by folding over all of them.
+table_count(QH) ->
+    qlc:fold(fun(_, Acc) -> Acc + 1 end, 0, QH).
+
+%% Fetch at most N channel pids through a QLC cursor, then delete the cursor.
+take_connections(N) ->
+    ChanQH = qlc:q([ChanPid || {_ClientId, ChanPid} <- connection_table()]),
+    ChanPidCursor = qlc:cursor(ChanQH),
+    ChanPids = qlc:next_answers(ChanPidCursor, N),
+    ok = qlc:delete_cursor(ChanPidCursor),
+    ChanPids.
+
+%% Fetch at most N {ClientId, ConnInfo, ClientInfo} rows for channels with a
+%% session in the given connection state. The cursor is deleted in an `after'
+%% section so it is released even when fetching the answers raises.
+take_channel_with_sessions(N, ConnState) ->
+    ChanPidCursor = qlc:cursor(channel_with_session_table(ConnState)),
+    try
+        qlc:next_answers(ChanPidCursor, N)
+    after
+        ok = qlc:delete_cursor(ChanPidCursor)
+    end.
+
+%% Send a disconnect request to at most N live connections.
+do_evict_connections(N, ServerReference) when N > 0 ->
+    ChanPids = take_connections(N),
+    ok = lists:foreach(
+           fun(ChanPid) ->
+                   disconnect_channel(ChanPid, ServerReference)
+           end,
+           ChanPids).
+
+%% Hand at most N sessions over to nodes picked at random from Nodes.
+%% Per-session failures are logged by evict_session_channel/4 and do not
+%% abort the loop.
+do_evict_sessions(N, Nodes, ConnState) when N > 0 ->
+    Channels = take_channel_with_sessions(N, ConnState),
+    ok = lists:foreach(
+           fun({ClientId, ConnInfo, ClientInfo}) ->
+                   evict_session_channel(Nodes, ClientId, ConnInfo, ClientInfo)
+           end,
+           Channels).
+
+%% Pick a random target node and ask it, over rpc, to take the session up
+%% via evict_session_channel/3. RPC failures are turned into {error, Reason}.
+evict_session_channel(Nodes, ClientId, ConnInfo, ClientInfo) ->
+    Node = select_random(Nodes),
+    ?LOG(info, "Evicting client=~p to node=~p, conninfo=~p, clientinfo=~p",
+         [ClientId, Node, ConnInfo, ClientInfo]),
+    case rpc:call(Node, ?MODULE, evict_session_channel, [ClientId, ConnInfo, ClientInfo]) of
+        {badrpc, Reason} ->
+            ?LOG(error, "RPC error while evicting client=~p to node=~p: ~p",
+                 [ClientId, Node, Reason]),
+            {error, Reason};
+        {error, Reason} = Error ->
+            ?LOG(error, "Error evicting client=~p to node=~p: ~p",
+                 [ClientId, Node, Reason]),
+            Error;
+        Res -> Res
+    end.
+
+%% Runs on the receiving node: start a supervised eviction channel for the
+%% client and return the supervisor's start result.
+evict_session_channel(ClientId, ConnInfo, ClientInfo) ->
+    ?LOG(info, "Taking up client=~p, conninfo=~p, clientinfo=~p",
+         [ClientId, ConnInfo, ClientInfo]),
+    Result = emqx_eviction_agent_channel:start_supervised(
+               #{conninfo => ConnInfo,
+                 clientinfo => ClientInfo}),
+    ?LOG(info, "Taking up client=~p, result=~p",
+         [ClientId, Result]),
+    Result.
+
+%% Ask a channel process to disconnect its client with
+%% ?RC_USE_ANOTHER_SERVER and the given Server-Reference property.
+disconnect_channel(ChanPid, ServerReference) ->
+    ChanPid ! {disconnect,
+               ?RC_USE_ANOTHER_SERVER,
+               use_another_server,
+               #{'Server-Reference' => ServerReference}}.
+
+%% Uniformly pick one element of a non-empty list. The non-empty check is a
+%% pattern match rather than a length/1 guard; an empty list still fails with
+%% function_clause.
+select_random([_ | _] = List) ->
+    lists:nth(rand:uniform(length(List)), List).
diff --git a/apps/emqx_eviction_agent/src/emqx_eviction_agent_api.erl b/apps/emqx_eviction_agent/src/emqx_eviction_agent_api.erl
new file mode 100644
index 000000000..426f5978d
--- /dev/null
+++ b/apps/emqx_eviction_agent/src/emqx_eviction_agent_api.erl
@@ -0,0 +1,36 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_api).
+
+-include_lib("emqx/include/logger.hrl").
+
+-rest_api(#{name => node_eviction_status,
+            method => 'GET',
+            path => "/node_eviction/status",
+            func => status,
+            descr => "Get node eviction status"}).
+
+-export([status/2]).
+
+%% Handler for GET /node_eviction/status: returns `#{status => disabled}' or
+%% `#{status => enabled, stats => Stats}' as reported by
+%% emqx_eviction_agent:status/0. Bindings and params are ignored.
+status(_Bindings, _Params) ->
+    case emqx_eviction_agent:status() of
+        disabled ->
+            {ok, #{status => disabled}};
+        {enabled, Stats} ->
+            {ok, #{status => enabled,
+                   stats => Stats}}
+    end.
diff --git a/apps/emqx_eviction_agent/src/emqx_eviction_agent_app.erl b/apps/emqx_eviction_agent/src/emqx_eviction_agent_app.erl
new file mode 100644
index 000000000..c13fcfb0d
--- /dev/null
+++ b/apps/emqx_eviction_agent/src/emqx_eviction_agent_app.erl
@@ -0,0 +1,36 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_app).
+
+-behaviour(application).
+
+-emqx_plugin(?MODULE).
+
+-export([ start/2
+        , stop/1
+        ]).
+
+%% Application start: install the connect/connack hooks, start the top-level
+%% supervisor with the application environment, then register the CLI
+%% command. Any step that does not return the expected value crashes the
+%% start (assertive matches).
+start(_Type, _Args) ->
+    Env = application:get_all_env(emqx_eviction_agent),
+    ok = emqx_eviction_agent:hook(),
+    {ok, Sup} = emqx_eviction_agent_sup:start_link(Env),
+    ok = emqx_eviction_agent_cli:load(),
+    {ok, Sup}.
+
+%% Application stop: remove the hooks and unregister the CLI command.
+stop(_State) ->
+    ok = emqx_eviction_agent:unhook(),
+    ok = emqx_eviction_agent_cli:unload().
diff --git a/apps/emqx_eviction_agent/src/emqx_eviction_agent_channel.erl b/apps/emqx_eviction_agent/src/emqx_eviction_agent_channel.erl
new file mode 100644
index 000000000..1f6fad00f
--- /dev/null
+++ b/apps/emqx_eviction_agent/src/emqx_eviction_agent_channel.erl
@@ -0,0 +1,299 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+%% MQTT Channel
+-module(emqx_eviction_agent_channel).
+
+-include_lib("emqx/include/emqx.hrl").
+-include_lib("emqx/include/emqx_mqtt.hrl").
+-include_lib("emqx/include/logger.hrl").
+-include_lib("emqx/include/types.hrl").
+
+-include_lib("snabbkaffe/include/snabbkaffe.hrl").
+
+-logger_header("[Evicted Channel]").
+
+-export([start_link/1,
+         start_supervised/1,
+         call/2,
+         call/3,
+         cast/2,
+         stop/1
+        ]).
+
+-export([init/1,
+         handle_call/3,
+         handle_cast/2,
+         handle_info/2,
+         terminate/2,
+         code_change/3
+        ]).
+
+-type opts() :: #{conninfo := emqx_types:conninfo(),
+                  clientinfo := emqx_types:clientinfo()}.
+
+%%--------------------------------------------------------------------
+%% API
+%%--------------------------------------------------------------------
+
+%% @doc Start a channel for the given client under
+%% emqx_eviction_agent_conn_sup as a temporary worker.
+%% The child id is the client id plus a unique positive integer, so the id
+%% never collides with another child started for the same client.
+-spec start_supervised(opts()) -> startlink_ret().
+start_supervised(#{clientinfo := #{clientid := ClientId}} = Opts) ->
+    RandomId = integer_to_binary(erlang:unique_integer([positive])),
+    Id = <<ClientId/binary, "-", RandomId/binary>>,
+    ChildSpec = #{id => Id,
+                  start => {?MODULE, start_link, [Opts]},
+                  restart => temporary,
+                  shutdown => 5000,
+                  type => worker,
+                  modules => [?MODULE]
+                 },
+    supervisor:start_child(
+      emqx_eviction_agent_conn_sup,
+      ChildSpec).
+
+%% @doc Start an unregistered channel process linked to the caller.
+-spec start_link(opts()) -> startlink_ret().
+start_link(Opts) ->
+    gen_server:start_link(?MODULE, [Opts], []).
+
+%% @doc Send an asynchronous request to a channel process.
+-spec cast(pid(), term()) -> ok.
+cast(Pid, Req) ->
+    gen_server:cast(Pid, Req).
+
+%% @doc Synchronous request to a channel process with no timeout.
+-spec call(pid(), term()) -> term().
+call(Pid, Req) ->
+    call(Pid, Req, infinity).
+
+%% @doc Synchronous request to a channel process with an explicit timeout.
+-spec call(pid(), term(), timeout()) -> term().
+call(Pid, Req, Timeout) ->
+    gen_server:call(Pid, Req, Timeout).
+
+%% @doc Stop a channel process and wait for it to terminate.
+-spec stop(pid()) -> ok.
+stop(Pid) ->
+    gen_server:stop(Pid).
+
+%%--------------------------------------------------------------------
+%% gen_server API
+%%--------------------------------------------------------------------
+
+%% Reopen the client's existing session on this node and arm the session
+%% expiry timer. The process traps exits and hibernates once initialised.
+%% Start-up stops with the error reason when the session cannot be opened or
+%% the expiry interval is not positive.
+init([#{conninfo := OldConnInfo, clientinfo := #{clientid := ClientId} = OldClientInfo}]) ->
+    process_flag(trap_exit, true),
+    ClientInfo = clientinfo(OldClientInfo),
+    ConnInfo = conninfo(OldConnInfo),
+    case open_session(ConnInfo, ClientInfo) of
+        {ok, Channel0} ->
+            case set_expiry_timer(Channel0) of
+                {ok, Channel1} ->
+                    ?LOG(
+                       info,
+                       "Channel initialized for client=~p on node=~p",
+                       [ClientId, node()]),
+                    {ok, Channel1, hibernate};
+                {error, Reason} ->
+                    {stop, Reason}
+            end;
+        {error, Reason} ->
+            {stop, Reason}
+    end.
+
+%% kick / discard: reply ok and stop with the corresponding reason.
+handle_call(kick, _From, Channel) ->
+    {stop, kicked, ok, Channel};
+
+handle_call(discard, _From, Channel) ->
+    {stop, discarded, ok, Channel};
+
+%% Takeover, phase one: hand out the session and switch to takeover mode, in
+%% which handle_deliver/2 buffers deliveries in `pendings'.
+handle_call({takeover, 'begin'}, _From, #{session := Session} = Channel) ->
+    {reply, Session, Channel#{takeover => true}};
+
+%% Takeover, phase two: reply with the deliveries still in the mailbox
+%% followed by the buffered pendings, then stop normally.
+handle_call({takeover, 'end'}, _From, #{session := Session,
+                                        clientinfo := #{clientid := ClientId},
+                                        pendings := Pendings} = Channel) ->
+    ok = emqx_session:takeover(Session),
+    %% TODO: Should not drain deliver here (side effect)
+    Delivers = emqx_misc:drain_deliver(),
+    AllPendings = lists:append(Delivers, Pendings),
+    ?tp(debug,
+        emqx_channel_takeover_end,
+        #{clientid => ClientId}),
+    {stop, normal, AllPendings, Channel};
+
+%% This channel answers ACL cache listings with an empty list.
+handle_call(list_acl_cache, _From, Channel) ->
+    {reply, [], Channel};
+
+%% Quota requests are acknowledged without any effect on the state.
+handle_call({quota, _Policy}, _From, Channel) ->
+    {reply, ok, Channel};
+
+%% Anything else is logged and answered with `ignored'.
+handle_call(Req, _From, Channel) ->
+    ?LOG(error, "Unexpected call: ~p", [Req]),
+    {reply, ignored, Channel}.
+
+%% A delivery arrived: pull every further delivery already in the mailbox
+%% and process them as one batch.
+handle_info(Deliver = {deliver, _Topic, _Msg}, Channel) ->
+    Delivers = [Deliver | emqx_misc:drain_deliver()],
+    {noreply, handle_deliver(Delivers, Channel)};
+
+%% Sent by the timer armed in set_expiry_timer/1.
+handle_info(expire_session, Channel) ->
+    {stop, expired, Channel};
+
+handle_info(Info, Channel) ->
+    ?LOG(error, "Unexpected info: ~p", [Info]),
+    {noreply, Channel}.
+
+%% No casts are expected; log and ignore.
+handle_cast(Msg, Channel) ->
+    ?LOG(error, "Unexpected cast: ~p", [Msg]),
+    {noreply, Channel}.
+
+%% Cancel the expiry timer (if any) and let emqx_session finish the session
+%% with the termination reason.
+terminate(Reason, #{clientinfo := ClientInfo, session := Session} = Channel) ->
+    ok = cancel_expiry_timer(Channel),
+    emqx_session:terminate(ClientInfo, Reason, Session).
+
+code_change(_OldVsn, Channel, _Extra) ->
+    {ok, Channel}.
+
+%%--------------------------------------------------------------------
+%% Internal functions
+%%--------------------------------------------------------------------
+
+%% Both clauses first drop nacked shared-subscription deliveries
+%% (maybe_nack/1) and then apply ignore_local/4. During a takeover the result
+%% is appended to `pendings'; otherwise it is enqueued into the session.
+handle_deliver(Delivers,
+               #{takeover := true,
+                 pendings := Pendings,
+                 session := Session,
+                 clientinfo := #{clientid := ClientId} = ClientInfo} = Channel) ->
+    %% NOTE: Order is important here. While the takeover is in
+    %% progress, the session cannot enqueue messages, since it already
+    %% passed on the queue to the new connection in the session state.
+    NPendings = lists:append(
+                  Pendings,
+                  ignore_local(ClientInfo, maybe_nack(Delivers), ClientId, Session)),
+    Channel#{pendings => NPendings};
+
+handle_deliver(Delivers,
+               #{takeover := false,
+                 session := Session,
+                 clientinfo := #{clientid := ClientId} = ClientInfo} = Channel) ->
+    NSession = emqx_session:enqueue(
+                 ClientInfo,
+                 ignore_local(ClientInfo, maybe_nack(Delivers), ClientId, Session),
+                 Session),
+    Channel#{session => NSession}.
+
+%% Cancel the expiry timer when one is set; `expiry_timer' is `undefined'
+%% (or absent) otherwise, which the second clause accepts.
+cancel_expiry_timer(#{expiry_timer := TRef}) when is_reference(TRef) ->
+    _ = erlang:cancel_timer(TRef),
+    ok;
+cancel_expiry_timer(_) ->
+    ok.
+
+%% Arm the session expiry timer from conninfo's `expiry_interval'.
+%% ?UINT_MAX starts no timer; a positive value (treated as seconds, see
+%% timer:seconds/1) schedules an `expire_session' message to self(); any
+%% other value is reported as {error, should_be_expired}.
+%% Crashes with badkey if conninfo has no `expiry_interval'.
+set_expiry_timer(#{conninfo := ConnInfo} = Channel) ->
+    case maps:get(expiry_interval, ConnInfo) of
+        ?UINT_MAX -> {ok, Channel};
+        I when I > 0 ->
+            Timer = erlang:send_after(timer:seconds(I), self(), expire_session),
+            {ok, Channel#{expiry_timer => Timer}};
+        _ ->
+            {error, should_be_expired}
+    end.
+
+%% Open the client's session through emqx_cm and build the channel state.
+%% Returns {error, no_session} when emqx_cm reports `present := false'.
+%% Otherwise the pendings returned by emqx_cm plus the deliveries already in
+%% the mailbox are de-duplicated (lists:usort/1, which also sorts them),
+%% filtered by maybe_nack/1 and ignore_local/4 and enqueued into the session;
+%% the channel info is then registered with emqx_cm.
+%% NOTE(review): the `false' passed first to emqx_cm:open_session/3 is
+%% presumably the clean-start flag -- confirm against emqx_cm.
+open_session(ConnInfo, #{clientid := ClientId} = ClientInfo) ->
+    Channel = channel(ConnInfo, ClientInfo),
+    case emqx_cm:open_session(false, ClientInfo, ConnInfo) of
+        {ok, #{present := false}} ->
+            ?LOG(info, "No session for clientid=~p", [ClientId]),
+            {error, no_session};
+        {ok, #{session := Session, present := true, pendings := Pendings0}} ->
+            ?LOG(info, "Session opened for client=~p on node=~p", [ClientId, node()]),
+            Pendings1 = lists:usort(lists:append(Pendings0, emqx_misc:drain_deliver())),
+            NSession = emqx_session:enqueue(
+                         ClientInfo,
+                         ignore_local(
+                           ClientInfo,
+                           maybe_nack(Pendings1),
+                           ClientId,
+                           Session),
+                         Session),
+            NChannel = Channel#{session => NSession},
+            ok = emqx_cm:insert_channel_info(ClientId, info(NChannel), []),
+            ?LOG(info, "Channel info updated for client=~p on node=~p", [ClientId, node()]),
+            {ok, NChannel};
+        {error, Reason} = Error ->
+            ?LOG(error, "Failed to open session due to ~p", [Reason]),
+            Error
+    end.
+
+%% Build this channel's conninfo from the evicted connection's: keep only the
+%% listed keys, mark it as disconnected and owned by this module, and keep
+%% the original `disconnected_at' (defaulting to the current time in ms).
+conninfo(OldConnInfo) ->
+    DisconnectedAt = maps:get(disconnected_at, OldConnInfo, erlang:system_time(millisecond)),
+    ConnInfo0 = maps:with(
+                  [socktype,
+                   sockname,
+                   peername,
+                   peercert,
+                   clientid,
+                   clean_start,
+                   receive_maximum,
+                   expiry_interval],
+                  OldConnInfo),
+    ConnInfo0#{
+        conn_mod => ?MODULE,
+        connected => false,
+        disconnected_at => DisconnectedAt
+    }.
+
+%% Keep only the listed clientinfo keys; everything else is dropped.
+clientinfo(OldClientInfo) ->
+    maps:with(
+      [zone,
+       protocol,
+       peerhost,
+       sockport,
+       clientid,
+       username,
+       is_bridge,
+       is_superuser,
+       mountpoint],
+      OldClientInfo).
+
+%% Initial channel state: no session yet, no expiry timer, not in takeover,
+%% and an empty list of pending deliveries.
+channel(ConnInfo, ClientInfo) ->
+    #{conninfo => ConnInfo,
+      clientinfo => ClientInfo,
+      expiry_timer => undefined,
+      takeover => false,
+      resuming => false,
+      pendings => []
+     }.
+
+%% Channel info map registered with emqx_cm; the connection state is always
+%% reported as `disconnected'.
+info(Channel) ->
+    #{conninfo => maps:get(conninfo, Channel, undefined),
+      clientinfo => maps:get(clientinfo, Channel, undefined),
+      session => maps:get(session, Channel, undefined),
+      conn_state => disconnected
+     }.
+
+%% Remove every delivery that the subscriber published itself on a topic it
+%% subscribed to with the No Local option (`nl := 1'), running the
+%% 'delivery.dropped' hook and bumping the drop metrics for each one.
+%% lists:filter/2 is used so that matching deliveries are removed wherever
+%% they appear in the list, not only while they form its leading run.
+ignore_local(ClientInfo, Delivers, Subscriber, Session) ->
+    Subs = emqx_session:info(subscriptions, Session),
+    lists:filter(fun({deliver, Topic, #message{from = Publisher} = Msg}) ->
+                         case maps:find(Topic, Subs) of
+                             {ok, #{nl := 1}} when Subscriber =:= Publisher ->
+                                 ok = emqx_hooks:run('delivery.dropped', [ClientInfo, Msg, no_local]),
+                                 ok = emqx_metrics:inc('delivery.dropped'),
+                                 ok = emqx_metrics:inc('delivery.dropped.no_local'),
+                                 %% dropped: exclude from the result
+                                 false;
+                             _ ->
+                                 true
+                         end
+                 end, Delivers).
+
+%% Keep only the deliveries that were not nacked (see not_nacked/1).
+maybe_nack(Delivers) ->
+    lists:filter(fun not_nacked/1, Delivers).
+
+%% A delivery is dropped only when its message requires a shared-subscription
+%% ack and the nack was sent successfully; otherwise it is kept.
+not_nacked({deliver, _Topic, Msg}) ->
+    not (emqx_shared_sub:is_ack_required(Msg)
+         andalso (ok == emqx_shared_sub:nack_no_connection(Msg))).
diff --git a/apps/emqx_eviction_agent/src/emqx_eviction_agent_cli.erl b/apps/emqx_eviction_agent/src/emqx_eviction_agent_cli.erl
new file mode 100644
index 000000000..632f8e480
--- /dev/null
+++ b/apps/emqx_eviction_agent/src/emqx_eviction_agent_cli.erl
@@ -0,0 +1,42 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_eviction_agent_cli). + +%% APIs +-export([ load/0 + , unload/0 + , cli/1 + ]). + +load() -> + emqx_ctl:register_command(eviction, {?MODULE, cli}, []). + +unload() -> + emqx_ctl:unregister_command(eviction). + +cli(["status"]) -> + case emqx_eviction_agent:status() of + disabled -> + emqx_ctl:print("Eviction status: disabled~n"); + {enabled, _Stats} -> + emqx_ctl:print("Eviction status: enabled~n") + end; + +cli(_) -> + emqx_ctl:usage( + [{"eviction status", + "Get current node eviction status"}]). diff --git a/apps/emqx_eviction_agent/src/emqx_eviction_agent_conn_sup.erl b/apps/emqx_eviction_agent/src/emqx_eviction_agent_conn_sup.erl new file mode 100644 index 000000000..2b11ce9d1 --- /dev/null +++ b/apps/emqx_eviction_agent/src/emqx_eviction_agent_conn_sup.erl @@ -0,0 +1,33 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_eviction_agent_conn_sup). + +-behaviour(supervisor). + +-export([start_link/1]). + +-export([init/1]). + +start_link(Env) -> + supervisor:start_link({local, ?MODULE}, ?MODULE, [Env]). 
+ +init([_Env]) -> + Childs = [], + {ok, { + {one_for_one, 10, 3600}, + Childs} + }. diff --git a/apps/emqx_eviction_agent/src/emqx_eviction_agent_sup.erl b/apps/emqx_eviction_agent/src/emqx_eviction_agent_sup.erl new file mode 100644 index 000000000..9e28c2251 --- /dev/null +++ b/apps/emqx_eviction_agent/src/emqx_eviction_agent_sup.erl @@ -0,0 +1,43 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_eviction_agent_sup). + +-behaviour(supervisor). + +-export([start_link/1]). + +-export([init/1]). + +start_link(Env) -> + supervisor:start_link({local, ?MODULE}, ?MODULE, [Env]). + +init([_Env]) -> + Childs = [child_spec(worker, emqx_eviction_agent, []), + child_spec(supervisor, emqx_eviction_agent_conn_sup, [#{}])], + {ok, { + {one_for_one, 10, 3600}, + Childs} + }. + +child_spec(Type, Mod, Args) -> + #{id => Mod, + start => {Mod, start_link, Args}, + restart => permanent, + shutdown => 5000, + type => Type, + modules => [Mod] + }. 
diff --git a/apps/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl b/apps/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl new file mode 100644 index 000000000..b77c0dee8 --- /dev/null +++ b/apps/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl @@ -0,0 +1,232 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_eviction_agent_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("eunit/include/eunit.hrl"). +-include_lib("common_test/include/ct.hrl"). +-include_lib("emqx/include/emqx_mqtt.hrl"). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). + +-import(emqx_eviction_agent_test_helpers, + [emqtt_connect/0, emqtt_connect/2]). + +all() -> + emqx_ct:all(?MODULE). + +init_per_suite(Config) -> + emqx_ct_helpers:start_apps([emqx_eviction_agent]), + Config. + +end_per_suite(_Config) -> + emqx_ct_helpers:stop_apps([emqx_eviction_agent]). + +init_per_testcase(t_explicit_session_takeover, Config) -> + _ = emqx_eviction_agent:disable(test_eviction), + Node = emqx_node_helpers:start_slave( + evacuate1, + #{start_apps => [emqx, emqx_eviction_agent]}), + [{evacuate_node, Node} | Config]; +init_per_testcase(_TestCase, Config) -> + _ = emqx_eviction_agent:disable(test_eviction), + Config. 
+ +end_per_testcase(t_explicit_session_takeover, Config) -> + _ = emqx_node_helpers:stop_slave(?config(evacuate_node, Config)), + _ = emqx_eviction_agent:disable(test_eviction); +end_per_testcase(_TestCase, _Config) -> + _ = emqx_eviction_agent:disable(test_eviction). + +t_enable_disable(_Config) -> + erlang:process_flag(trap_exit, true), + + ?assertMatch( + disabled, + emqx_eviction_agent:status()), + + {ok, C0} = emqtt_connect(), + ok = emqtt:disconnect(C0), + + ok = emqx_eviction_agent:enable(test_eviction, undefined), + + ?assertMatch( + {error, eviction_agent_busy}, + emqx_eviction_agent:enable(bar, undefined)), + + ?assertMatch( + ok, + emqx_eviction_agent:enable(test_eviction, <<"srv">>)), + + ?assertMatch( + {enabled, #{}}, + emqx_eviction_agent:status()), + + ?assertMatch( + {error, {use_another_server, #{}}}, + emqtt_connect()), + + ?assertMatch( + {error, eviction_agent_busy}, + emqx_eviction_agent:disable(bar)), + + ?assertMatch( + ok, + emqx_eviction_agent:disable(test_eviction)), + + ?assertMatch( + {error, disabled}, + emqx_eviction_agent:disable(test_eviction)), + + ?assertMatch( + disabled, + emqx_eviction_agent:status()), + + {ok, C1} = emqtt_connect(), + ok = emqtt:disconnect(C1). + + +t_evict_connections_status(_Config) -> + erlang:process_flag(trap_exit, true), + + {ok, _C} = emqtt_connect(), + + {error, disabled} = emqx_eviction_agent:evict_connections(1), + + ok = emqx_eviction_agent:enable(test_eviction, undefined), + + ?assertMatch( + {enabled, #{connections := 1, sessions := _}}, + emqx_eviction_agent:status()), + + ok = emqx_eviction_agent:evict_connections(1), + + ct:sleep(100), + + ?assertMatch( + {enabled, #{connections := 0, sessions := _}}, + emqx_eviction_agent:status()), + + ok = emqx_eviction_agent:disable(test_eviction). 
+ + +t_explicit_session_takeover(Config) -> + erlang:process_flag(trap_exit, true), + + {ok, C0} = emqtt_connect(<<"client_with_session">>, false), + {ok, _, _} = emqtt:subscribe(C0, <<"t1">>), + + ok = emqx_eviction_agent:enable(test_eviction, undefined), + + ?assertEqual( + 1, + emqx_eviction_agent:connection_count()), + + ok = emqx_eviction_agent:evict_connections(1), + + receive + {'EXIT', C0, {disconnected, ?RC_USE_ANOTHER_SERVER, _}} -> ok + after 1000 -> + ?assert(false, "Connection not evicted") + end, + + ?assertEqual( + 0, + emqx_eviction_agent:connection_count()), + + ?assertEqual( + 1, + emqx_eviction_agent:session_count()), + + %% First, evacuate to the same node + + ?check_trace( + ?wait_async_action( + emqx_eviction_agent:evict_sessions(1, node()), + #{?snk_kind := emqx_channel_takeover_end}, + 1000), + fun(_Result, Trace) -> + ?assertMatch( + [#{clientid := <<"client_with_session">>} | _ ], + ?of_kind(emqx_channel_takeover_end, Trace)) + end), + + ok = emqx_eviction_agent:disable(test_eviction), + ok = connect_and_publish(<<"t1">>, <<"MessageToEvictedSession1">>), + ok = emqx_eviction_agent:enable(test_eviction, undefined), + + %% Evacuate to another node + + TargetNodeForEvacuation = ?config(evacuate_node, Config), + ?check_trace( + ?wait_async_action( + emqx_eviction_agent:evict_sessions(1, TargetNodeForEvacuation), + #{?snk_kind := emqx_channel_takeover_end}, + 1000), + fun(_Result, Trace) -> + ?assertMatch( + [#{clientid := <<"client_with_session">>} | _ ], + ?of_kind(emqx_channel_takeover_end, Trace)) + end), + + ?assertEqual( + 0, + emqx_eviction_agent:session_count()), + + ?assertEqual( + 1, + rpc:call(TargetNodeForEvacuation, emqx_eviction_agent, session_count, [])), + + ok = emqx_eviction_agent:disable(test_eviction), + + ct:pal("evicted chann info: ~p", [emqx_cm:get_chan_info(<<"client_with_session">>)]), + + ok = connect_and_publish(<<"t1">>, <<"MessageToEvictedSession2">>), + ct:sleep(100), + + {ok, C2} = 
emqtt_connect(<<"client_with_session">>, false), + + ok = assert_receive_publish( + [#{payload => <<"MessageToEvictedSession1">>, topic => <<"t1">>}, + #{payload => <<"MessageToEvictedSession2">>, topic => <<"t1">>}]), + ok = emqtt:disconnect(C2). + +t_disable_on_restart(_Config) -> + ok = emqx_eviction_agent:enable(test_eviction, undefined), + + ok = supervisor:terminate_child(emqx_eviction_agent_sup, emqx_eviction_agent), + {ok, _} = supervisor:restart_child(emqx_eviction_agent_sup, emqx_eviction_agent), + + ?assertEqual( + disabled, + emqx_eviction_agent:status()). + +assert_receive_publish([]) -> ok; +assert_receive_publish([#{payload := Msg, topic := Topic} | Rest]) -> + receive + {publish, #{payload := Msg, + topic := Topic}} -> + assert_receive_publish(Rest) + after 1000 -> + ?assert(false, "Message `" ++ binary_to_list(Msg) ++ "` is lost") + end. + +connect_and_publish(Topic, Message) -> + {ok, C} = emqtt_connect(), + emqtt:publish(C, Topic, Message), + ok = emqtt:disconnect(C). diff --git a/apps/emqx_eviction_agent/test/emqx_eviction_agent_api_SUITE.erl b/apps/emqx_eviction_agent/test/emqx_eviction_agent_api_SUITE.erl new file mode 100644 index 000000000..f9585c0e6 --- /dev/null +++ b/apps/emqx_eviction_agent/test/emqx_eviction_agent_api_SUITE.erl @@ -0,0 +1,64 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_eviction_agent_api_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("eunit/include/eunit.hrl"). +-include_lib("common_test/include/ct.hrl"). + +-import(emqx_mgmt_api_test_helpers, + [request_api/3, + auth_header_/0, + api_path/1]). + +all() -> + emqx_ct:all(?MODULE). + +init_per_suite(Config) -> + emqx_ct_helpers:start_apps([emqx_eviction_agent, emqx_management]), + Config. + +end_per_suite(Config) -> + emqx_ct_helpers:stop_apps([emqx_management, emqx_eviction_agent]), + Config. + +t_status(_Config) -> + ?assertMatch( + {ok, #{<<"status">> := <<"disabled">>}}, + api_get(["node_eviction", "status"])), + + ok = emqx_eviction_agent:enable(apitest, undefined), + + ?assertMatch( + {ok, #{<<"status">> := <<"enabled">>, + <<"stats">> := #{}}}, + api_get(["node_eviction", "status"])), + + ok = emqx_eviction_agent:disable(apitest), + + ?assertMatch( + {ok, #{<<"status">> := <<"disabled">>}}, + api_get(["node_eviction", "status"])). + +api_get(Path) -> + case request_api(get, api_path(Path), auth_header_()) of + {ok, ResponseBody} -> + {ok, jiffy:decode(list_to_binary(ResponseBody), [return_maps])}; + {error, _} = Error -> Error + end. diff --git a/apps/emqx_eviction_agent/test/emqx_eviction_agent_channel_SUITE.erl b/apps/emqx_eviction_agent/test/emqx_eviction_agent_channel_SUITE.erl new file mode 100644 index 000000000..cbbf1be5a --- /dev/null +++ b/apps/emqx_eviction_agent/test/emqx_eviction_agent_channel_SUITE.erl @@ -0,0 +1,155 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. 
+%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_eviction_agent_channel_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("eunit/include/eunit.hrl"). +-include_lib("common_test/include/ct.hrl"). +-include_lib("emqx/include/emqx_mqtt.hrl"). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). + +-define(CLIENT_ID, <<"client_with_session">>). + +-import(emqx_eviction_agent_test_helpers, + [emqtt_connect/2]). + +all() -> + emqx_ct:all(?MODULE). + +init_per_suite(Config) -> + emqx_ct_helpers:start_apps([emqx_eviction_agent]), + Config. + +end_per_suite(_Config) -> + emqx_ct_helpers:stop_apps([emqx_eviction_agent]). + +t_start_no_session(_Config) -> + Opts = #{clientinfo => #{clientid => ?CLIENT_ID, + zone => internal}, + conninfo => #{clientid => ?CLIENT_ID, + receive_maximum => 32}}, + ?assertMatch( + {error, {no_session, _}}, + emqx_eviction_agent_channel:start_supervised(Opts)). + +t_start_no_expire(_Config) -> + erlang:process_flag(trap_exit, true), + + _ = emqtt_connect(?CLIENT_ID, false), + + Opts = #{clientinfo => #{clientid => ?CLIENT_ID, + zone => internal}, + conninfo => #{clientid => ?CLIENT_ID, + receive_maximum => 32, + expiry_interval => 0}}, + ?assertMatch( + {error, {should_be_expired, _}}, + emqx_eviction_agent_channel:start_supervised(Opts)). 
+ +t_start_infinite_expire(_Config) -> + erlang:process_flag(trap_exit, true), + + _ = emqtt_connect(?CLIENT_ID, false), + + Opts = #{clientinfo => #{clientid => ?CLIENT_ID, + zone => internal}, + conninfo => #{clientid => ?CLIENT_ID, + receive_maximum => 32, + expiry_interval => ?UINT_MAX}}, + ?assertMatch( + {ok, _}, + emqx_eviction_agent_channel:start_supervised(Opts)). + +t_kick(_Config) -> + erlang:process_flag(trap_exit, true), + + _ = emqtt_connect(?CLIENT_ID, false), + Opts = evict_session_opts(?CLIENT_ID), + + {ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts), + + ?assertEqual( + ok, + emqx_eviction_agent_channel:call(Pid, kick)). + +t_discard(_Config) -> + erlang:process_flag(trap_exit, true), + + _ = emqtt_connect(?CLIENT_ID, false), + Opts = evict_session_opts(?CLIENT_ID), + + {ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts), + + ?assertEqual( + ok, + emqx_eviction_agent_channel:call(Pid, discard)). + +t_stop(_Config) -> + erlang:process_flag(trap_exit, true), + + _ = emqtt_connect(?CLIENT_ID, false), + Opts = evict_session_opts(?CLIENT_ID), + + {ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts), + + ?assertEqual( + ok, + emqx_eviction_agent_channel:stop(Pid)). + + +t_ignored_calls(_Config) -> + erlang:process_flag(trap_exit, true), + + _ = emqtt_connect(?CLIENT_ID, false), + Opts = evict_session_opts(?CLIENT_ID), + + {ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts), + + ok = emqx_eviction_agent_channel:cast(Pid, unknown), + Pid ! unknown, + + ?assertEqual( + [], + emqx_eviction_agent_channel:call(Pid, list_acl_cache)), + + ?assertEqual( + ok, + emqx_eviction_agent_channel:call(Pid, {quota, quota})), + + ?assertEqual( + ignored, + emqx_eviction_agent_channel:call(Pid, unknown)). 
+ +t_expire(_Config) -> + erlang:process_flag(trap_exit, true), + + _ = emqtt_connect(?CLIENT_ID, false), + #{conninfo := ConnInfo} = Opts0 = evict_session_opts(?CLIENT_ID), + Opts1 = Opts0#{conninfo => ConnInfo#{expiry_interval => 1}}, + + {ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts1), + + ct:sleep(1500), + + ?assertNot(is_process_alive(Pid)). + +evict_session_opts(ClientId) -> + maps:with( + [conninfo, clientinfo], + emqx_cm:get_chan_info(ClientId)). diff --git a/apps/emqx_eviction_agent/test/emqx_eviction_agent_cli_SUITE.erl b/apps/emqx_eviction_agent/test/emqx_eviction_agent_cli_SUITE.erl new file mode 100644 index 000000000..9c05cdb21 --- /dev/null +++ b/apps/emqx_eviction_agent/test/emqx_eviction_agent_cli_SUITE.erl @@ -0,0 +1,47 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_eviction_agent_cli_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("eunit/include/eunit.hrl"). +-include_lib("common_test/include/ct.hrl"). + +all() -> + emqx_ct:all(?MODULE). + +init_per_suite(Config) -> + emqx_ct_helpers:start_apps([emqx_eviction_agent]), + Config. + +end_per_suite(Config) -> + _ = emqx_eviction_agent:disable(foo), + emqx_ct_helpers:stop_apps([emqx_eviction_agent]), + Config. 
+ +t_status(_Config) -> + %% usage + ok = emqx_eviction_agent_cli:cli(["foobar"]), + + %% status + ok = emqx_eviction_agent_cli:cli(["status"]), + + ok = emqx_eviction_agent:enable(foo, undefined), + + %% status + ok = emqx_eviction_agent_cli:cli(["status"]). diff --git a/apps/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl b/apps/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl new file mode 100644 index 000000000..68b86f059 --- /dev/null +++ b/apps/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl @@ -0,0 +1,55 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_eviction_agent_test_helpers). + +-export([emqtt_connect/0, + emqtt_connect/2, + emqtt_connect_many/1, + emqtt_try_connect/0]). + +emqtt_connect() -> + emqtt_connect(<<"client1">>, true). + +emqtt_connect(ClientId, CleanStart) -> + {ok, C} = emqtt:start_link( + [{clientid, ClientId}, + {clean_start, CleanStart}, + {proto_ver, v5}, + {properties, #{'Session-Expiry-Interval' => 600}} + ]), + case emqtt:connect(C) of + {ok, _} -> {ok, C}; + {error, _} = Error -> Error + end. 
+ +emqtt_connect_many(Count) -> + lists:map( + fun(N) -> + NBin = integer_to_binary(N), + ClientId = <<"client-", NBin/binary>>, + {ok, C} = emqtt_connect(ClientId, false), + C + end, + lists:seq(1, Count)). + +emqtt_try_connect() -> + case emqtt_connect() of + {ok, C} -> + emqtt:disconnect(C), + ok; + {error, _} = Error -> Error + end. diff --git a/apps/emqx_management/test/emqx_mgmt_api_SUITE.erl b/apps/emqx_management/test/emqx_mgmt_api_SUITE.erl index cfed74c16..fda5c2adf 100644 --- a/apps/emqx_management/test/emqx_mgmt_api_SUITE.erl +++ b/apps/emqx_management/test/emqx_mgmt_api_SUITE.erl @@ -25,13 +25,12 @@ -include_lib("emqx/include/emqx_mqtt.hrl"). -include_lib("emqx_management/include/emqx_mgmt.hrl"). --define(CONTENT_TYPE, "application/x-www-form-urlencoded"). - --define(HOST, "http://127.0.0.1:8081/"). - --define(API_VERSION, "v4"). - --define(BASE_PATH, "api"). +-import(emqx_mgmt_api_test_helpers, + [request_api/3, + request_api/4, + request_api/5, + auth_header_/0, + api_path/1]). all() -> emqx_ct:all(?MODULE). @@ -657,49 +656,6 @@ t_data_import_content(_) -> application:stop(emqx_rule_engine), application:stop(emqx_dashboard). -request_api(Method, Url, Auth) -> - request_api(Method, Url, [], Auth, []). - -request_api(Method, Url, QueryParams, Auth) -> - request_api(Method, Url, QueryParams, Auth, []). - -request_api(Method, Url, QueryParams, Auth, []) -> - NewUrl = case QueryParams of - "" -> Url; - _ -> Url ++ "?" ++ QueryParams - end, - do_request_api(Method, {NewUrl, [Auth]}); -request_api(Method, Url, QueryParams, Auth, Body) -> - NewUrl = case QueryParams of - "" -> Url; - _ -> Url ++ "?" ++ QueryParams - end, - do_request_api(Method, {NewUrl, [Auth], "application/json", emqx_json:encode(Body)}). 
- -do_request_api(Method, Request)-> - ct:pal("Method: ~p, Request: ~p", [Method, Request]), - case httpc:request(Method, Request, [], []) of - {error, socket_closed_remotely} -> - {error, socket_closed_remotely}; - {ok, {{"HTTP/1.1", Code, _}, _, Return} } - when Code =:= 200 orelse Code =:= 201 -> - {ok, Return}; - {ok, {Reason, _, _}} -> - {error, Reason} - end. - -auth_header_() -> - AppId = <<"admin">>, - AppSecret = <<"public">>, - auth_header_(binary_to_list(AppId), binary_to_list(AppSecret)). - -auth_header_(User, Pass) -> - Encoded = base64:encode_to_string(lists:append([User,":",Pass])), - {"Authorization","Basic " ++ Encoded}. - -api_path(Parts)-> - ?HOST ++ filename:join([?BASE_PATH, ?API_VERSION] ++ Parts). - filter(List, Key, Value) -> lists:filter(fun(Item) -> maps:get(Key, Item) == Value diff --git a/apps/emqx_management/test/emqx_mgmt_api_test_helpers.erl b/apps/emqx_management/test/emqx_mgmt_api_test_helpers.erl new file mode 100644 index 000000000..a943ca760 --- /dev/null +++ b/apps/emqx_management/test/emqx_mgmt_api_test_helpers.erl @@ -0,0 +1,69 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2020-2022 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_mgmt_api_test_helpers). + +-compile(export_all). +-compile(nowarn_export_all). 
+ +-define(HOST, "http://127.0.0.1:8081/"). + +-define(API_VERSION, "v4"). + +-define(BASE_PATH, "api"). + +request_api(Method, Url, Auth) -> + request_api(Method, Url, [], Auth, []). + +request_api(Method, Url, QueryParams, Auth) -> + request_api(Method, Url, QueryParams, Auth, []). + +request_api(Method, Url, QueryParams, Auth, []) -> + NewUrl = case QueryParams of + "" -> Url; + _ -> Url ++ "?" ++ QueryParams + end, + do_request_api(Method, {NewUrl, [Auth]}); +request_api(Method, Url, QueryParams, Auth, Body) -> + NewUrl = case QueryParams of + "" -> Url; + _ -> Url ++ "?" ++ QueryParams + end, + do_request_api(Method, {NewUrl, [Auth], "application/json", emqx_json:encode(Body)}). + +do_request_api(Method, Request)-> + ct:pal("Method: ~p, Request: ~p", [Method, Request]), + case httpc:request(Method, Request, [], []) of + {error, socket_closed_remotely} -> + {error, socket_closed_remotely}; + {ok, {{"HTTP/1.1", Code, _}, _, Return} } + when Code =:= 200 orelse Code =:= 201 -> + {ok, Return}; + {ok, {Reason, _, _}} -> + {error, Reason} + end. + +auth_header_() -> + AppId = <<"admin">>, + AppSecret = <<"public">>, + auth_header_(binary_to_list(AppId), binary_to_list(AppSecret)). + +auth_header_(User, Pass) -> + Encoded = base64:encode_to_string(lists:append([User,":",Pass])), + {"Authorization","Basic " ++ Encoded}. + +api_path(Parts)-> + ?HOST ++ filename:join([?BASE_PATH, ?API_VERSION] ++ Parts). 
diff --git a/apps/emqx_node_rebalance/.gitignore b/apps/emqx_node_rebalance/.gitignore new file mode 100644 index 000000000..f1c455451 --- /dev/null +++ b/apps/emqx_node_rebalance/.gitignore @@ -0,0 +1,19 @@ +.rebar3 +_* +.eunit +*.o +*.beam +*.plt +*.swp +*.swo +.erlang.cookie +ebin +log +erl_crash.dump +.rebar +logs +_build +.idea +*.iml +rebar3.crashdump +*~ diff --git a/apps/emqx_node_rebalance/README.md b/apps/emqx_node_rebalance/README.md new file mode 100644 index 000000000..2e56f62cd --- /dev/null +++ b/apps/emqx_node_rebalance/README.md @@ -0,0 +1,9 @@ +emqx_node_rebalance +===== + +An OTP library + +Build +----- + + $ rebar3 compile diff --git a/apps/emqx_node_rebalance/etc/emqx_node_rebalance.conf b/apps/emqx_node_rebalance/etc/emqx_node_rebalance.conf new file mode 100644 index 000000000..8ace22435 --- /dev/null +++ b/apps/emqx_node_rebalance/etc/emqx_node_rebalance.conf @@ -0,0 +1,3 @@ +##-------------------------------------------------------------------- +## EMQX Node Rebalance Plugin +##-------------------------------------------------------------------- diff --git a/apps/emqx_node_rebalance/include/emqx_node_rebalance.hrl b/apps/emqx_node_rebalance/include/emqx_node_rebalance.hrl new file mode 100644 index 000000000..1903b87cc --- /dev/null +++ b/apps/emqx_node_rebalance/include/emqx_node_rebalance.hrl @@ -0,0 +1,31 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-define(DEFAULT_CONN_EVICT_RATE, 500). +-define(DEFAULT_SESS_EVICT_RATE, 500). + +-define(DEFAULT_WAIT_HEALTH_CHECK, 60). %% sec +-define(DEFAULT_WAIT_TAKEOVER, 60). %% sec + +-define(DEFAULT_ABS_CONN_THRESHOLD, 1000). +-define(DEFAULT_ABS_SESS_THRESHOLD, 1000). + +-define(DEFAULT_REL_CONN_THRESHOLD, 1.1). +-define(DEFAULT_REL_SESS_THRESHOLD, 1.1). + +-define(EVICT_INTERVAL, 1000). + +-define(EVACUATION_FILENAME, <<".evacuation">>). diff --git a/apps/emqx_node_rebalance/priv/emqx_node_rebalance.schema b/apps/emqx_node_rebalance/priv/emqx_node_rebalance.schema new file mode 100644 index 000000000..e69de29bb diff --git a/apps/emqx_node_rebalance/rebar.config b/apps/emqx_node_rebalance/rebar.config new file mode 100644 index 000000000..2656fd554 --- /dev/null +++ b/apps/emqx_node_rebalance/rebar.config @@ -0,0 +1,2 @@ +{erl_opts, [debug_info]}. +{deps, []}. diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance.app.src b/apps/emqx_node_rebalance/src/emqx_node_rebalance.app.src new file mode 100644 index 000000000..761cd1d6f --- /dev/null +++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance.app.src @@ -0,0 +1,19 @@ +{application, emqx_node_rebalance, + [{description, "EMQX Node Rebalance"}, + {vsn, "4.3.0"}, + {registered, [emqx_node_rebalance_sup, + emqx_node_rebalance, + emqx_node_rebalance_agent, + emqx_node_rebalance_evacuation]}, + {applications, + [kernel, + stdlib + ]}, + {mod, {emqx_node_rebalance_app,[]}}, + {env,[]}, + {modules, []}, + {maintainers, ["EMQX Team "]}, + {links, [{"Homepage", "https://emqx.io/"}, + {"Github", "https://github.com/emqx"} + ]} + ]}. 
diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance.erl new file mode 100644 index 000000000..1edbbbe3b --- /dev/null +++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance.erl @@ -0,0 +1,414 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance). + +-include("emqx_node_rebalance.hrl"). + +-include_lib("emqx/include/logger.hrl"). +-include_lib("emqx/include/types.hrl"). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). + +-export([start/1, + status/0, + status/1, + stop/0 + ]). + +-export([start_link/0]). + +-behavior(gen_statem). + +-export([init/1, + callback_mode/0, + handle_event/4, + code_change/4 + ]). + +-export([is_node_available/0, + available_nodes/1, + connection_count/0, + session_count/0, + disconnected_session_count/0]). 
+ +%%-------------------------------------------------------------------- +%% APIs +%%-------------------------------------------------------------------- + +-type start_opts() :: #{conn_evict_rate => pos_integer(), + sess_evict_rate => pos_integer(), + wait_health_check => pos_integer(), + wait_takeover => pos_integer(), + abs_conn_threshold => pos_integer(), + rel_conn_threshold => number(), + abs_sess_threshold => pos_integer(), + rel_sess_threshold => number(), + nodes => [node()] + }. +-type start_error() :: already_started | [{node(), term()}]. + + +-spec start(start_opts()) -> ok_or_error(start_error()). +start(StartOpts) -> + Opts = maps:merge(default_opts(), StartOpts), + gen_statem:call(?MODULE, {start, Opts}). + +-spec stop() -> ok_or_error(not_started). +stop() -> + gen_statem:call(?MODULE, stop). + +-spec status() -> disabled | {enabled, map()}. +status() -> + gen_statem:call(?MODULE, status). + +-spec status(pid()) -> disabled | {enabled, map()}. +status(Pid) -> + gen_statem:call(Pid, status). + +-spec start_link() -> startlink_ret(). +start_link() -> + gen_statem:start_link({local, ?MODULE}, ?MODULE, [], []). + +-spec available_nodes(list(node())) -> list(node()). +available_nodes(Nodes) when is_list(Nodes) -> + {Available, _} = rpc:multicall(Nodes, ?MODULE, is_node_available, []), + lists:filter(fun is_atom/1, Available). + +%%-------------------------------------------------------------------- +%% gen_statem callbacks +%%-------------------------------------------------------------------- + +callback_mode() -> handle_event_function. + +%% states: disabled, wait_health_check, evicting_conns, wait_takeover, evicting_sessions + +init([]) -> + ?tp(debug, emqx_node_rebalance_started, #{}), + {ok, disabled, #{}}. 
+
+%% start
+%% Only accepted in the `disabled' state. On success the coordinator moves to
+%% `wait_health_check' and arms a timer; eviction begins when it fires.
+handle_event({call, Caller},
+             {start, #{wait_health_check := HealthCheckSecs} = Opts},
+             disabled,
+             #{} = Data) ->
+    case enable_rebalance(Data#{opts => Opts}) of
+        {ok, EnabledData} ->
+            ?LOG(warning, "Node rebalance enabled: ~p", [Opts]),
+            Actions = [{state_timeout, seconds(HealthCheckSecs), evict_conns},
+                       {reply, Caller, ok}],
+            {next_state, wait_health_check, EnabledData, Actions};
+        {error, Reason} ->
+            ?LOG(warning, "Node rebalance enabling failed: ~p", [Reason]),
+            reply_only(Caller, {error, Reason})
+    end;
+handle_event({call, Caller}, {start, _Opts}, _State, #{}) ->
+    reply_only(Caller, {error, already_started});
+
+%% stop
+handle_event({call, Caller}, stop, disabled, #{}) ->
+    reply_only(Caller, {error, not_started});
+handle_event({call, Caller}, stop, _State, Data) ->
+    ok = disable_rebalance(Data),
+    ?LOG(warning, "Node rebalance stopped"),
+    {next_state, disabled, deinit(Data), [{reply, Caller, ok}]};
+
+%% status
+handle_event({call, Caller}, status, disabled, #{}) ->
+    reply_only(Caller, disabled);
+handle_event({call, Caller}, status, State, Data) ->
+    Stats = get_stats(State, Data),
+    reply_only(Caller, {enabled, Stats#{state => State,
+                                         coordinator_node => node()}});
+
+%% conn eviction
+handle_event(state_timeout, evict_conns, wait_health_check, Data) ->
+    ?LOG(warning, "Node rebalance wait_health_check over"),
+    {next_state, evicting_conns, Data, [{state_timeout, 0, evict_conns}]};
+
+%% One eviction round per timeout: either the thresholds are met and the
+%% coordinator moves on to `wait_takeover', or another round is scheduled.
+handle_event(state_timeout,
+             evict_conns,
+             evicting_conns,
+             #{opts := #{wait_takeover := TakeoverSecs,
+                         evict_interval := Interval}} = Data) ->
+    case evict_conns(Data) of
+        ok ->
+            ?LOG(warning, "Node rebalance evict_conns over"),
+            {next_state, wait_takeover, Data,
+             [{state_timeout, seconds(TakeoverSecs), evict_sessions}]};
+        {continue, RoundData} ->
+            {keep_state, RoundData, [{state_timeout, Interval, evict_conns}]}
+    end;
+
+%% session eviction
+handle_event(state_timeout, evict_sessions, wait_takeover, Data) ->
+    ?LOG(warning, "Node rebalance wait_takeover over"),
+    {next_state, evicting_sessions, Data, [{state_timeout, 0, evict_sessions}]};
+
+%% Same loop for sessions; when the thresholds are met the donors are
+%% released and the coordinator returns to `disabled'.
+handle_event(state_timeout,
+             evict_sessions,
+             evicting_sessions,
+             #{opts := #{evict_interval := Interval}} = Data) ->
+    case evict_sessions(Data) of
+        ok ->
+            ?tp(debug, emqx_node_rebalance_evict_sess_over, #{}),
+            ?LOG(warning, "Node rebalance evict_sess over"),
+            ok = disable_rebalance(Data),
+            ?LOG(warning, "Rebalance finished successfully"),
+            {next_state, disabled, deinit(Data)};
+        {continue, RoundData} ->
+            {keep_state, RoundData, [{state_timeout, Interval, evict_sessions}]}
+    end;
+
+%% anything else is logged and otherwise ignored
+handle_event({call, Caller}, Msg, State, Data) ->
+    ?LOG(warning, "Unknown call: ~p, State: ~p, Data: ~p", [Msg, State, Data]),
+    reply_only(Caller, ignored);
+
+handle_event(info, Msg, State, Data) ->
+    ?LOG(warning, "Unknown Msg: ~p, State: ~p, Data: ~p", [Msg, State, Data]),
+    keep_state_and_data;
+
+handle_event(cast, Msg, State, Data) ->
+    ?LOG(warning, "Unknown cast Msg: ~p, State: ~p, Data: ~p", [Msg, State, Data]),
+    keep_state_and_data.
+
+code_change(_Vsn, State, Data, _Extra) ->
+    {ok, State, Data}.
+
+%% Answer a call without touching the state or the data.
+reply_only(Caller, Reply) ->
+    {keep_state_and_data, [{reply, Caller, Reply}]}.
+
+%%--------------------------------------------------------------------
+%% internal funs
+%%--------------------------------------------------------------------
+
+%% Decide whether the nodes listed in the `nodes' option need rebalancing
+%% and, if they do, enable the rebalance agent on the donor nodes.
+%%
+%% Nodes whose connection count is at or above the average become donors,
+%% the others become recipients. Returns `{ok, Data}' extended with the
+%% donor/recipient lists and the initial counts, or
+%% `{error, nothing_to_balance}'. Failures reported by multicall/2 are
+%% raised as errors.
+%%
+%% An empty node list has no average to compare against, so it is reported
+%% as `nothing_to_balance' instead of crashing in avg/1.
+enable_rebalance(#{opts := #{nodes := []}}) ->
+    {error, nothing_to_balance};
+enable_rebalance(#{opts := Opts} = Data) ->
+    Nodes = maps:get(nodes, Opts),
+    ConnCounts = multicall(Nodes, {?MODULE, connection_count, []}),
+    SessCounts = multicall(Nodes, {?MODULE, session_count, []}),
+    {_, Counts} = lists:unzip(ConnCounts),
+    Avg = avg(Counts),
+    {DonorCounts,
+     RecipientCounts} = lists:partition(
+                          fun({_Node, Count}) ->
+                                  Count >= Avg
+                          end,
+                          ConnCounts),
+    ?LOG(warning, "Enabling rebalance: ConnCounts=~p, DonorCounts=~p, RecipientCounts=~p",
+         [ConnCounts, DonorCounts, RecipientCounts]),
+    {DonorNodes, _} = lists:unzip(DonorCounts),
+    {RecipientNodes, _} = lists:unzip(RecipientCounts),
+    case need_rebalance(DonorNodes, RecipientNodes, ConnCounts, SessCounts, Opts) of
+        false -> {error, nothing_to_balance};
+        true ->
+            %% The coordinator pid is handed to the agents; the same pid is
+            %% passed again by disable_rebalance/1.
+            _ = multicall(DonorNodes, {emqx_node_rebalance_agent, enable, [self()]}),
+            {ok, Data#{donors => DonorNodes,
+                       recipients => RecipientNodes,
+                       initial_conn_counts => maps:from_list(ConnCounts),
+                       initial_sess_counts => maps:from_list(SessCounts)}}
+    end.
+
+%% Disable the rebalance agent on every donor node recorded in the data.
+disable_rebalance(#{donors := DonorNodes}) ->
+    _ = multicall(DonorNodes, {emqx_node_rebalance_agent, disable, [self()]}),
+    ok.
+
+%% One round of connection eviction.
+%%
+%% Compares the average connection count of donors and recipients. Returns
+%% `ok' when the donor average is within the `conn' thresholds; otherwise
+%% asks every donor above the recipient average to evict `conn_evict_rate'
+%% connections and returns `{continue, Data}' with the fresh averages and
+%% per-node counts stored in it.
+evict_conns(#{donors := Donors, recipients := Recipients, opts := Opts} = Data) ->
+    DonorNodeCounts = multicall(Donors, {?MODULE, connection_count, []}),
+    RecipientNodeCounts = multicall(Recipients, {?MODULE, connection_count, []}),
+    DonorAvg = avg([Count || {_Node, Count} <- DonorNodeCounts]),
+    RecipientAvg = avg([Count || {_Node, Count} <- RecipientNodeCounts]),
+    case within_thresholds(DonorAvg, RecipientAvg, thresholds(conn, Opts)) of
+        true ->
+            ok;
+        false ->
+            ConnEvictRate = maps:get(conn_evict_rate, Opts),
+            NodesToEvict = nodes_to_evict(RecipientAvg, DonorNodeCounts),
+            ?LOG(warning, "Node rebalance, evict_conns, nodes=~p, counts=~p",
+                 [NodesToEvict, ConnEvictRate]),
+            _ = multicall(NodesToEvict,
+                          {emqx_eviction_agent, evict_connections, [ConnEvictRate]}),
+            {continue, Data#{donor_conn_avg => DonorAvg,
+                             recipient_conn_avg => RecipientAvg,
+                             donor_conn_counts => maps:from_list(DonorNodeCounts),
+                             recipient_conn_counts => maps:from_list(RecipientNodeCounts)}}
+    end.
+
+%% One round of session eviction, based on disconnected session counts.
+%%
+%% Returns `ok' when the donor average is within the `sess' thresholds;
+%% otherwise asks every donor above the recipient average to evict
+%% `sess_evict_rate' disconnected sessions towards the recipient nodes and
+%% returns `{continue, Data}' with the fresh averages and per-node counts.
+evict_sessions(#{donors := Donors, recipients := Recipients, opts := Opts} = Data) ->
+    DonorNodeCounts = multicall(Donors, {?MODULE, disconnected_session_count, []}),
+    RecipientNodeCounts = multicall(Recipients, {?MODULE, disconnected_session_count, []}),
+    DonorAvg = avg([Count || {_Node, Count} <- DonorNodeCounts]),
+    RecipientAvg = avg([Count || {_Node, Count} <- RecipientNodeCounts]),
+    case within_thresholds(DonorAvg, RecipientAvg, thresholds(sess, Opts)) of
+        true ->
+            ok;
+        false ->
+            SessEvictRate = maps:get(sess_evict_rate, Opts),
+            NodesToEvict = nodes_to_evict(RecipientAvg, DonorNodeCounts),
+            ?LOG(warning, "Node rebalance, evict_sessions, nodes=~p, counts=~p",
+                 [NodesToEvict, SessEvictRate]),
+            EvictArgs = [SessEvictRate, Recipients, disconnected],
+            _ = multicall(NodesToEvict, {emqx_eviction_agent, evict_sessions, EvictArgs}),
+            {continue, Data#{donor_sess_avg => DonorAvg,
+                             recipient_sess_avg => RecipientAvg,
+                             donor_sess_counts => maps:from_list(DonorNodeCounts),
+                             recipient_sess_counts => maps:from_list(RecipientNodeCounts)}}
+    end.
+
+%% A rebalance is needed when there is at least one donor and one recipient
+%% and the donors exceed the thresholds for connections or for sessions.
+need_rebalance([], _RecipientNodes, _ConnCounts, _SessCounts, _Opts) ->
+    false;
+need_rebalance(_DonorNodes, [], _ConnCounts, _SessCounts, _Opts) ->
+    false;
+need_rebalance(DonorNodes, RecipientNodes, ConnCounts, SessCounts, Opts) ->
+    DonorConnAvg = avg_for_nodes(DonorNodes, ConnCounts),
+    RecipientConnAvg = avg_for_nodes(RecipientNodes, ConnCounts),
+    DonorSessAvg = avg_for_nodes(DonorNodes, SessCounts),
+    RecipientSessAvg = avg_for_nodes(RecipientNodes, SessCounts),
+    %% Session thresholds are only consulted when connections are balanced.
+    Balanced =
+        within_thresholds(DonorConnAvg, RecipientConnAvg, thresholds(conn, Opts))
+        andalso within_thresholds(DonorSessAvg, RecipientSessAvg, thresholds(sess, Opts)),
+    Result = not Balanced,
+    ?tp(debug, emqx_node_rebalance_need_rebalance,
+        #{donors => DonorNodes,
+          recipients => RecipientNodes,
+          conn_counts => ConnCounts,
+          sess_counts => SessCounts,
+          opts => Opts,
+          result => Result
+         }),
+    Result.
+
+%% Average of the counts that belong to Nodes in a `[{Node, Count}]' list.
+avg_for_nodes(Nodes, Counts) ->
+    CountsByNode = maps:from_list(Counts),
+    Selected = maps:with(Nodes, CountsByNode),
+    avg(maps:values(Selected)).
+
+%% True when Value is at most Goal plus the absolute margin, or at most
+%% Goal multiplied by the relative factor.
+within_thresholds(Value, Goal, {AbsMargin, RelFactor}) ->
+    WithinAbs = Value =< Goal + AbsMargin,
+    WithinAbs orelse (Value =< Goal * RelFactor).
+
+%% `{Absolute, Relative}' threshold pair for connections or sessions.
+thresholds(conn, #{abs_conn_threshold := Abs, rel_conn_threshold := Rel}) ->
+    {Abs, Rel};
+thresholds(sess, #{abs_sess_threshold := Abs, rel_sess_threshold := Rel}) ->
+    {Abs, Rel}.
+
+%% Nodes whose count is strictly above Goal.
+nodes_to_evict(Goal, NodeCounts) ->
+    [Node || {Node, Count} <- NodeCounts, Count > Goal].
+
+%% The status payload is the whole state data, except when disabled.
+get_stats(disabled, _Data) ->
+    #{};
+get_stats(_State, Data) ->
+    Data.
+
+%% Arithmetic mean; deliberately undefined for the empty list.
+avg(Numbers) when length(Numbers) >= 1 ->
+    Total = lists:sum(Numbers),
+    Total / length(Numbers).
+
+%% Call M:F(A) on all Nodes and return `[{Node, Result}]'.
+%%
+%% Every node must answer `ok' or `{ok, Result}'. If any node is unreachable
+%% or answers with anything else, an `{bad_nodes, ...}' error is raised.
+multicall(Nodes, {M, F, A}) ->
+    case rpc:multicall(Nodes, M, F, A) of
+        {Replies, []} ->
+            NodeReplies = lists:zip(Nodes, Replies),
+            case [NodeReply || NodeReply <- NodeReplies, not is_ok(NodeReply)] of
+                [] ->
+                    [{Node, ok_result(Reply)} || {Node, Reply} <- NodeReplies];
+                BadResults ->
+                    error({bad_nodes, BadResults})
+            end;
+        {_, [_ | _] = BadNodes} ->
+            error({bad_nodes, BadNodes})
+    end.
+
+is_ok({_Node, ok}) -> true;
+is_ok({_Node, {ok, _}}) -> true;
+is_ok(_) -> false.
+
+ok_result(ok) -> ok;
+ok_result({ok, Result}) -> Result.
+
+%% The three counters below are the RPC targets used through multicall/2,
+%% hence the `{ok, _}' wrapping.
+connection_count() ->
+    {ok, emqx_eviction_agent:connection_count()}.
+
+session_count() ->
+    {ok, emqx_eviction_agent:session_count()}.
+
+disconnected_session_count() ->
+    {ok, emqx_eviction_agent:session_count(disconnected)}.
+
+%% Defaults merged under the options given to start/1.
+default_opts() ->
+    #{
+      nodes => all_nodes(),
+
+      wait_health_check => ?DEFAULT_WAIT_HEALTH_CHECK,
+      wait_takeover => ?DEFAULT_WAIT_TAKEOVER,
+      evict_interval => ?EVICT_INTERVAL,
+
+      conn_evict_rate => ?DEFAULT_CONN_EVICT_RATE,
+      abs_conn_threshold => ?DEFAULT_ABS_CONN_THRESHOLD,
+      rel_conn_threshold => ?DEFAULT_REL_CONN_THRESHOLD,
+
+      sess_evict_rate => ?DEFAULT_SESS_EVICT_RATE,
+      abs_sess_threshold => ?DEFAULT_ABS_SESS_THRESHOLD,
+      rel_sess_threshold => ?DEFAULT_REL_SESS_THRESHOLD
+     }.
+
+
+%% Drop the options and the collected statistics from the state data.
+deinit(Data) ->
+    StatKeys = [recipient_conn_avg, recipient_sess_avg, donor_conn_avg, donor_sess_avg,
+                recipient_conn_counts, recipient_sess_counts,
+                donor_conn_counts, donor_sess_counts,
+                initial_conn_counts, initial_sess_counts],
+    maps:without([opts | StatKeys], Data).
+
+%% Returns the local node name, or crashes with `badmatch' unless the
+%% rebalance agent is registered locally and the eviction agent reports
+%% `disabled'.
+is_node_available() ->
+    true = is_pid(whereis(emqx_node_rebalance_agent)),
+    disabled = emqx_eviction_agent:status(),
+    node().
+
+all_nodes() ->
+    ekka_mnesia:cluster_nodes(all).
+
+%% Seconds to a whole number of milliseconds.
+seconds(Sec) ->
+    Millis = timer:seconds(Sec),
+    round(Millis).
diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance_agent.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_agent.erl
new file mode 100644
index 000000000..ade043d6b
--- /dev/null
+++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance_agent.erl
@@ -0,0 +1,127 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_agent).
+
+-include_lib("emqx/include/emqx_mqtt.hrl").
+-include_lib("emqx/include/logger.hrl").
+-include_lib("emqx/include/types.hrl").
+
+-include_lib("stdlib/include/qlc.hrl").
+-include_lib("snabbkaffe/include/snabbkaffe.hrl").
+
+%% The module is started with gen_server:start_link/4 and implements the
+%% gen_server callbacks, so declare the behaviour to let the compiler check
+%% them.
+-behaviour(gen_server).
+
+-export([start_link/0,
+         enable/1,
+         disable/1,
+         status/0
+        ]).
+
+-export([init/1,
+         handle_call/3,
+         handle_info/2,
+         handle_cast/2,
+         code_change/3
+        ]).
+
+%% Kind passed to emqx_eviction_agent when enabling/disabling it.
+-define(ENABLE_KIND, emqx_node_rebalance).
+
+%%--------------------------------------------------------------------
+%% APIs
+%%--------------------------------------------------------------------
+
+-type status() :: {enabled, pid()} | disabled.
+
+%% Start the agent and register it locally under the module name.
+-spec start_link() -> startlink_ret().
+start_link() ->
+    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+-spec enable(pid()) -> ok_or_error(already_enabled | eviction_agent_busy).
+enable(CoordinatorPid) ->
+    gen_server:call(?MODULE, {enable, CoordinatorPid}).
+
+-spec disable(pid()) -> ok_or_error(already_disabled | invalid_coordinator).
+disable(CoordinatorPid) ->
+    gen_server:call(?MODULE, {disable, CoordinatorPid}).
+
+-spec status() -> status().
+status() ->
+    gen_server:call(?MODULE, status).
+
+%%--------------------------------------------------------------------
+%% gen_server callbacks
+%%--------------------------------------------------------------------
+
+%% The state is a map; it holds `coordinator_pid' and `eviction_agent_pid'
+%% only while the agent is enabled.
+init([]) ->
+    {ok, #{}}.
+
+%% enable: refused while a coordinator is already recorded. Otherwise link
+%% to the coordinator and to the eviction agent, then enable the eviction
+%% agent; the links are removed again if it is busy.
+handle_call({enable, _CoordinatorPid}, _From, #{coordinator_pid := _Pid} = St) ->
+    {reply, {error, already_enabled}, St};
+handle_call({enable, CoordinatorPid}, _From, St) ->
+    true = link(CoordinatorPid),
+    EvictionAgentPid = whereis(emqx_eviction_agent),
+    true = link(EvictionAgentPid),
+    case emqx_eviction_agent:enable(?ENABLE_KIND, undefined) of
+        ok ->
+            EnabledSt = #{coordinator_pid => CoordinatorPid,
+                          eviction_agent_pid => EvictionAgentPid},
+            {reply, ok, EnabledSt};
+        {error, eviction_agent_busy} ->
+            true = unlink(EvictionAgentPid),
+            true = unlink(CoordinatorPid),
+            {reply, {error, eviction_agent_busy}, St}
+    end;
+
+%% disable: only the coordinator that enabled the agent may disable it.
+handle_call({disable, CoordinatorPid},
+            _From,
+            #{coordinator_pid := CoordinatorPid,
+              eviction_agent_pid := EvictionAgentPid} = St) ->
+    _ = emqx_eviction_agent:disable(?ENABLE_KIND),
+    true = unlink(EvictionAgentPid),
+    true = unlink(CoordinatorPid),
+    {reply, ok, maps:without([coordinator_pid, eviction_agent_pid], St)};
+handle_call({disable, _CoordinatorPid}, _From, #{coordinator_pid := _Other} = St) ->
+    {reply, {error, invalid_coordinator}, St};
+handle_call({disable, _CoordinatorPid}, _From, #{} = St) ->
+    {reply, {error, already_disabled}, St};
+
+%% status
+handle_call(status, _From, #{coordinator_pid := Pid} = St) ->
+    {reply, {enabled, Pid}, St};
+handle_call(status, _From, St) ->
+    {reply, disabled, St};
+
+handle_call(Msg, _From, St) ->
+    ?LOG(warning, "Unknown call: ~p, State: ~p", [Msg, St]),
+    {reply, ignored, St}.
+
+handle_info(Msg, St) ->
+    ?LOG(warning, "Unknown Msg: ~p, State: ~p", [Msg, St]),
+    {noreply, St}.
+
+%% Casts are not part of the agent protocol; log and ignore them.
+handle_cast(Cast, State) ->
+    ?LOG(warning, "Unknown cast Msg: ~p, State: ~p", [Cast, State]),
+    {noreply, State}.
+
+%% No state conversion is performed on code upgrade.
+code_change(_Vsn, State, _Extra) ->
+    {ok, State}.
diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance_api.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_api.erl
new file mode 100644
index 000000000..6ba221cd8
--- /dev/null
+++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance_api.erl
@@ -0,0 +1,243 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_api).
+
+-import(minirest, [return/1]).
+
+%% Read-only endpoints.
+-rest_api(#{name => load_rebalance_status,
+            method => 'GET',
+            path => "/load_rebalance/status",
+            func => status,
+            descr => "Get load rebalance status"}).
+
+-rest_api(#{name => load_rebalance_global_status,
+            method => 'GET',
+            path => "/load_rebalance/global_status",
+            func => global_status,
+            descr => "Get status of all rebalance/evacuation processes across the cluster"}).
+
+-rest_api(#{name => load_rebalance_availability_check,
+            method => 'GET',
+            path => "/load_rebalance/availability_check",
+            func => availability_check,
+            descr => "Node rebalance availability check"}).
+
+%% Endpoints that start or stop a process on the node named in the path.
+-rest_api(#{name => load_rebalance_start,
+            method => 'POST',
+            path => "/load_rebalance/:bin:node/start",
+            func => rebalance_start,
+            descr => "Start rebalancing with the node as coordinator"}).
+
+-rest_api(#{name => load_rebalance_stop,
+            method => 'POST',
+            path => "/load_rebalance/:bin:node/stop",
+            func => rebalance_stop,
+            descr => "Stop rebalancing coordinated by the node"}).
+
+-rest_api(#{name => load_rebalance_evacuation_start,
+            method => 'POST',
+            path => "/load_rebalance/:bin:node/evacuation/start",
+            func => rebalance_evacuation_start,
+            descr => "Start evacuation on a node "}).
+
+-rest_api(#{name => load_rebalance_evacuation_stop,
+            method => 'POST',
+            path => "/load_rebalance/:bin:node/evacuation/stop",
+            func => rebalance_evacuation_stop,
+            descr => "Stop evacuation on the node"}).
+
+-export([status/2,
+         availability_check/2,
+         global_status/2,
+
+         rebalance_evacuation_start/2,
+         rebalance_evacuation_stop/2,
+         rebalance_start/2,
+         rebalance_stop/2
+        ]).
+
+%% Local status: either `disabled' or the statistics of the running process
+%% tagged with its kind.
+status(_Bindings, _Params) ->
+    case emqx_node_rebalance_status:local_status() of
+        disabled ->
+            {ok, #{status => disabled}};
+        {Process, Stats} when Process =:= rebalance; Process =:= evacuation ->
+            {ok, format_status(Process, Stats)}
+    end.
+
+%% Cluster-wide status; the `[{Key, Value}]' lists reported by the status
+%% module are turned into maps.
+global_status(_Bindings, _Params) ->
+    #{evacuations := EvacuationList,
+      rebalances := RebalanceList} = emqx_node_rebalance_status:global_status(),
+    Evacuations = maps:from_list(EvacuationList),
+    Rebalances = maps:from_list(RebalanceList),
+    {ok, #{evacuations => Evacuations, rebalances => Rebalances}}.
+
+%% 200 while the local eviction agent is disabled, 503 while it is enabled.
+availability_check(_Bindings, _Params) ->
+    case emqx_eviction_agent:status() of
+        {enabled, _Stats} ->
+            {503, #{}};
+        disabled ->
+            {200, #{}}
+    end.
+
+%% Validate the request and start evacuation on the node from the path.
+rebalance_evacuation_start(#{node := NodeBin}, Params) ->
+    Start = fun() ->
+                    {Node, Opts} = validate_evacuation(NodeBin, params(Params)),
+                    rpc(Node, emqx_node_rebalance_evacuation, start, [Opts])
+            end,
+    validated(Start).
+
+%% Stop evacuation on the node from the path.
+rebalance_evacuation_stop(#{node := NodeBin}, _Params) ->
+    validated(
+      fun() ->
+              Node = parse_node(NodeBin),
+              rpc(Node, emqx_node_rebalance_evacuation, stop, [])
+      end).
+
+%% Validate the request and start a rebalance coordinated by the node from
+%% the path.
+rebalance_start(#{node := NodeBin}, Params) ->
+    validated(
+      fun() ->
+              {Node, Opts} = validate_rebalance(NodeBin, params(Params)),
+              rpc(Node, emqx_node_rebalance, start, [Opts])
+      end).
+
+%% Stop the rebalance coordinated by the node from the path.
+rebalance_stop(#{node := NodeBin}, _Params) ->
+    validated(
+      fun() ->
+              Node = parse_node(NodeBin),
+              rpc(Node, emqx_node_rebalance, stop, [])
+      end).
+
+%% Call M:F(A) on Node and turn the outcome into a response via return/1.
+%%
+%% Error texts are rendered as binaries, the same way validated/1 renders
+%% validation errors, instead of handing over the nested character lists
+%% produced by io_lib:format/2.
+rpc(Node, M, F, A) ->
+    case rpc:call(Node, M, F, A) of
+        ok ->
+            return({ok, []});
+        {error, Error} ->
+            return({error, 400, binfmt("~p", [Error])});
+        {badrpc, _} ->
+            return({error, 400, binfmt("Error communicating with node ~p", [Node])});
+        Unknown ->
+            return({error, 400, binfmt("Unrecognized rpc result from node ~p: ~p",
+                                       [Node, Unknown])})
+    end.
+
+%% io_lib:format/2 with a binary result.
+binfmt(Fmt, Args) ->
+    iolist_to_binary(io_lib:format(Fmt, Args)).
+
+format_status(Process, Stats) ->
+    Stats#{process => Process, status => enabled}.
+
+%% Both validators return `{Node, OptionsMap}' or throw a validation error
+%% (see validation_error/1).
+validate_evacuation(Node, Params) ->
+    NodeToEvacuate = parse_node(Node),
+    OptList = lists:map(
+                fun validate_evacuation_param/1,
+                Params),
+    {NodeToEvacuate, maps:from_list(OptList)}.
+
+validate_rebalance(Node, Params) ->
+    CoordinatorNode = parse_node(Node),
+    OptList = lists:map(
+                fun validate_rebalance_param/1,
+                Params),
+    {CoordinatorNode, maps:from_list(OptList)}.
+
+%% Map one request parameter to an `{OptionKey, Value}' pair.
+validate_evacuation_param({<<"conn_evict_rate">>, Value}) ->
+    validate_pos_int(conn_evict_rate, Value);
+validate_evacuation_param({<<"sess_evict_rate">>, Value}) ->
+    validate_pos_int(sess_evict_rate, Value);
+validate_evacuation_param({<<"redirect_to">>, Value}) ->
+    validate_binary(server_reference, Value);
+validate_evacuation_param({<<"wait_takeover">>, Value}) ->
+    validate_pos_int(wait_takeover, Value);
+validate_evacuation_param({<<"migrate_to">>, Value}) ->
+    validate_nodes(migrate_to, Value);
+validate_evacuation_param(Value) ->
+    validation_error(io_lib:format("Unknown evacuation param: ~p", [Value])).
+
+%% Map one request parameter to an `{OptionKey, Value}' pair, or throw a
+%% validation error for unknown parameters and invalid values.
+validate_rebalance_param({<<"wait_health_check">>, Value}) ->
+    validate_pos_int(wait_health_check, Value);
+validate_rebalance_param({<<"conn_evict_rate">>, Value}) ->
+    validate_pos_int(conn_evict_rate, Value);
+validate_rebalance_param({<<"sess_evict_rate">>, Value}) ->
+    validate_pos_int(sess_evict_rate, Value);
+validate_rebalance_param({<<"abs_conn_threshold">>, Value}) ->
+    validate_pos_int(abs_conn_threshold, Value);
+validate_rebalance_param({<<"rel_conn_threshold">>, Value}) ->
+    validate_fraction(rel_conn_threshold, Value);
+validate_rebalance_param({<<"abs_sess_threshold">>, Value}) ->
+    validate_pos_int(abs_sess_threshold, Value);
+validate_rebalance_param({<<"rel_sess_threshold">>, Value}) ->
+    validate_fraction(rel_sess_threshold, Value);
+validate_rebalance_param({<<"wait_takeover">>, Value}) ->
+    validate_pos_int(wait_takeover, Value);
+validate_rebalance_param({<<"nodes">>, Value}) ->
+    validate_nodes(nodes, Value);
+validate_rebalance_param(Value) ->
+    validation_error(io_lib:format("Unknown rebalance param: ~p", [Value])).
+
+validate_binary(Name, Value) when is_binary(Value) ->
+    {Name, Value};
+validate_binary(Name, _Value) ->
+    validation_error("invalid string in " ++ atom_to_list(Name)).
+
+%% Accepts integers greater than zero.
+validate_pos_int(Name, Value) when is_integer(Value), Value > 0 ->
+    {Name, Value};
+validate_pos_int(Name, _Value) ->
+    validation_error("invalid " ++ atom_to_list(Name) ++ " value").
+
+%% Accepts any number (integer or float) greater than 1.0.
+validate_fraction(Name, Value) when is_number(Value), Value > 1.0 ->
+    {Name, Value};
+validate_fraction(Name, _Value) ->
+    validation_error("invalid " ++ atom_to_list(Name) ++ " value").
+
+%% Parse a list of node names and make sure every one of them is reported
+%% as available by emqx_node_rebalance_evacuation:available_nodes/1.
+%% Returns `{Name, Nodes}' or throws a validation error.
+validate_nodes(Name, NodeList) when is_list(NodeList) ->
+    Nodes = lists:map(
+              fun parse_node/1,
+              NodeList),
+    case emqx_node_rebalance_evacuation:available_nodes(Nodes) of
+        [] ->
+            validation_error(io_lib:format("no available nodes list in ~p: ~p", [Name, Nodes]));
+        Nodes ->
+            {Name, Nodes};
+        OtherNodes ->
+            validation_error(
+              io_lib:format("unavailable nodes in ~p: ~p",
+                            [Name, Nodes -- OtherNodes]))
+    end;
+validate_nodes(Name, Nodes) ->
+    validation_error(io_lib:format("invalid node list in ~p: ~p", [Name, Nodes])).
+
+%% Run Fun and convert a thrown validation error into a 400 response.
+%% Only `throw:{validation_error, _}' is handled here.
+validated(Fun) ->
+    try
+        Fun()
+    catch throw:{validation_error, Error} ->
+        return({error, 400, iolist_to_binary(Error)})
+    end.
+
+validation_error(Error) ->
+    throw({validation_error, Error}).
+
+%% Turn a node name taken from the request into an atom.
+%%
+%% Only atoms that already exist are accepted, so request data cannot grow
+%% the atom table. Anything that is not a binary (for example a number
+%% inside a JSON node list) is reported as a validation error as well,
+%% rather than escaping validated/1 as a function_clause crash.
+parse_node(Bin) when is_binary(Bin) ->
+    try
+        binary_to_existing_atom(Bin)
+    catch
+        error:badarg ->
+            validation_error("invalid node: " ++ [Bin])
+    end;
+parse_node(Other) ->
+    validation_error(io_lib:format("invalid node: ~p", [Other])).
+
+%% `[{}]' stands for "no parameters" and is normalised to an empty list.
+params([{}]) -> [];
+params(Params) -> Params.
diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance_app.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_app.erl
new file mode 100644
index 000000000..3cba331de
--- /dev/null
+++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance_app.erl
@@ -0,0 +1,34 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_app).
+
+-behaviour(application).
+
+-emqx_plugin(?MODULE).
+
+-export([ start/2
+        , stop/1
+        ]).
+
+%% Start the supervision tree with the application environment, then
+%% register the `rebalance' CLI command.
+start(_Type, _Args) ->
+    {ok, Sup} = emqx_node_rebalance_sup:start_link(
+                  application:get_all_env(emqx_node_rebalance)),
+    ok = emqx_node_rebalance_cli:load(),
+    {ok, Sup}.
+
+%% Unregister the CLI command on application stop.
+stop(_State) ->
+    emqx_node_rebalance_cli:unload().
diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl
new file mode 100644
index 000000000..10d0912c4
--- /dev/null
+++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl
@@ -0,0 +1,265 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_cli).
+
+%% APIs
+-export([ load/0
+        , unload/0
+        , cli/1
+        ]).
+
+%% Register cli/1 as the handler of the `rebalance' command.
+load() ->
+    emqx_ctl:register_command(rebalance, {?MODULE, cli}, []).
+
+%% Remove the `rebalance' command again.
+unload() ->
+    emqx_ctl:unregister_command(rebalance).
+
+%% Entry point of the `rebalance' command.
+%%
+%% `start' and `stop' return `true' on success and `false' otherwise; any
+%% unrecognised argument list prints the usage text.
+cli(["start" | StartArgs]) ->
+    case start_args(StartArgs) of
+        {evacuation, Opts} ->
+            case emqx_node_rebalance_evacuation:status() of
+                disabled ->
+                    ok = emqx_node_rebalance_evacuation:start(Opts),
+                    emqx_ctl:print("Rebalance(evacuation) started~n"),
+                    true;
+                {enabled, _} ->
+                    emqx_ctl:print("Rebalance is already enabled~n"),
+                    false
+            end;
+        {rebalance, Opts} ->
+            case emqx_node_rebalance:start(Opts) of
+                ok ->
+                    emqx_ctl:print("Rebalance started~n"),
+                    true;
+                {error, Reason} ->
+                    emqx_ctl:print("Rebalance start error: ~p~n", [Reason]),
+                    false
+            end;
+        {error, Error} ->
+            emqx_ctl:print("Rebalance start error: ~s~n", [Error]),
+            false
+    end;
+cli(["node-status", NodeStr]) ->
+    %% NOTE(review): the node name comes straight from the command line and
+    %% is turned into a new atom; this is only acceptable because the CLI is
+    %% operator input.
+    Node = list_to_atom(NodeStr),
+    node_status(emqx_node_rebalance_status:local_status(Node));
+cli(["node-status"]) ->
+    node_status(emqx_node_rebalance_status:local_status());
+cli(["status"]) ->
+    #{evacuations := Evacuations,
+      rebalances := Rebalances} = emqx_node_rebalance_status:global_status(),
+    lists:foreach(
+      fun({Node, Status}) ->
+              emqx_ctl:print("--------------------------------------------------------------------~n"),
+              emqx_ctl:print("Node ~p: evacuation~n~s",
+                             [Node, emqx_node_rebalance_status:format_local_status(Status)])
+      end,
+      Evacuations),
+    lists:foreach(
+      fun({Node, Status}) ->
+              emqx_ctl:print("--------------------------------------------------------------------~n"),
+              emqx_ctl:print("Node ~p: rebalance coordinator~n~s",
+                             [Node, emqx_node_rebalance_status:format_coordinator_status(Status)])
+      end,
+      Rebalances);
+cli(["stop"]) ->
+    %% Evacuation is checked first; a rebalance is only looked at when no
+    %% evacuation is running on this node.
+    case emqx_node_rebalance_evacuation:status() of
+        {enabled, _} ->
+            ok = emqx_node_rebalance_evacuation:stop(),
+            emqx_ctl:print("Rebalance(evacuation) stopped~n"),
+            true;
+        disabled ->
+            case emqx_node_rebalance:status() of
+                {enabled, _} ->
+                    ok = emqx_node_rebalance:stop(),
+                    emqx_ctl:print("Rebalance stopped~n"),
+                    true;
+                disabled ->
+                    emqx_ctl:print("Rebalance is already disabled~n"),
+                    false
+            end
+    end;
+cli(_) ->
+    emqx_ctl:usage(
+      [{"rebalance start --evacuation \\\n"
+        "    [--redirect-to \"Host1:Port1 Host2:Port2 ...\"] \\\n"
+        "    [--conn-evict-rate CountPerSec] \\\n"
+        "    [--migrate-to \"node1@host1 node2@host2 ...\"] \\\n"
+        "    [--wait-takeover Secs] \\\n"
+        "    [--sess-evict-rate CountPerSec]",
+        "Start current node evacuation with optional server redirect to the specified servers"},
+
+       %% Each option is listed once and the description matches the
+       %% non-evacuation form of the command.
+       {"rebalance start \\\n"
+        "    [--nodes \"node1@host1 node2@host2\"] \\\n"
+        "    [--wait-health-check Secs] \\\n"
+        "    [--conn-evict-rate ConnPerSec] \\\n"
+        "    [--abs-conn-threshold Count] \\\n"
+        "    [--rel-conn-threshold Fraction] \\\n"
+        "    [--wait-takeover Secs] \\\n"
+        "    [--sess-evict-rate CountPerSec] \\\n"
+        "    [--abs-sess-threshold Count] \\\n"
+        "    [--rel-sess-threshold Fraction]",
+        "Start rebalancing connections and sessions among the specified nodes, "
+        "with the current node as coordinator"},
+
+       {"rebalance node-status",
+        "Get current node rebalance status"},
+
+       {"rebalance node-status \"node1@host1\"",
+        "Get remote node rebalance status"},
+
+       {"rebalance status",
+        "Get statuses of all current rebalance/evacuation processes across the cluster"},
+
+       {"rebalance stop",
+        "Stop node rebalance"}]).
+
+%% Print the outcome of emqx_node_rebalance_status:local_status/0,1.
+node_status(NodeStatus) ->
+    case NodeStatus of
+        {Process, Status} when Process =:= evacuation orelse Process =:= rebalance ->
+            emqx_ctl:print("Rebalance type: ~p~n~s~n",
+                           [Process, emqx_node_rebalance_status:format_local_status(Status)]);
+        disabled ->
+            emqx_ctl:print("Rebalance disabled~n");
+        Other ->
+            emqx_ctl:print("Error detecting rebalance status: ~p~n", [Other])
+    end.
+
+%% Turn the arguments of `rebalance start' into `{evacuation, Opts}',
+%% `{rebalance, Opts}' or `{error, Message}'. Evacuation is selected by the
+%% presence of `--evacuation'.
+start_args(Args) ->
+    case collect_args(Args, #{}) of
+        {ok, #{"--evacuation" := true} = Collected} ->
+            tag_validated(evacuation,
+                          validate_evacuation(maps:to_list(Collected), #{}));
+        {ok, #{} = Collected} ->
+            tag_validated(rebalance,
+                          validate_rebalance(maps:to_list(Collected), #{}));
+        {error, _} = Error ->
+            Error
+    end.
+
+tag_validated(Kind, {ok, Validated}) -> {Kind, Validated};
+tag_validated(_Kind, {error, _} = Error) -> Error.
+
+%% Collect the raw `Option => Value' pairs from the argument list.
+%% `--evacuation' is a flag; every other known option takes one value.
+collect_args([], Map) ->
+    {ok, Map};
+collect_args(["--evacuation" | Args], Map) ->
+    collect_args(Args, Map#{"--evacuation" => true});
+collect_args([Option, Value | Args] = Remaining, Map) ->
+    case lists:member(Option, value_options()) of
+        true ->
+            collect_args(Args, Map#{Option => Value});
+        false ->
+            {error, io_lib:format("unknown arguments: ~p", [Remaining])}
+    end;
+%% fallback
+collect_args(Remaining, _Map) ->
+    {error, io_lib:format("unknown arguments: ~p", [Remaining])}.
+
+value_options() ->
+    [%% evacuation
+     "--redirect-to",
+     "--migrate-to",
+     %% rebalance
+     "--nodes",
+     "--wait-health-check",
+     "--abs-conn-threshold",
+     "--rel-conn-threshold",
+     "--abs-sess-threshold",
+     "--rel-sess-threshold",
+     %% common
+     "--conn-evict-rate",
+     "--wait-takeover",
+     "--sess-evict-rate"].
+
+%% Convert the collected evacuation options into the option map handed to
+%% emqx_node_rebalance_evacuation:start/1 by cli/1.
+validate_evacuation([], Acc) ->
+    {ok, Acc};
+validate_evacuation([{"--evacuation", _} | Rest], Acc) ->
+    validate_evacuation(Rest, Acc);
+validate_evacuation([{"--redirect-to", ServerReference} | Rest], Acc) ->
+    validate_evacuation(Rest, Acc#{server_reference => list_to_binary(ServerReference)});
+validate_evacuation([{"--conn-evict-rate", _} | _] = Opts, Acc) ->
+    validate_pos_int(conn_evict_rate, Opts, Acc, fun validate_evacuation/2);
+validate_evacuation([{"--sess-evict-rate", _} | _] = Opts, Acc) ->
+    validate_pos_int(sess_evict_rate, Opts, Acc, fun validate_evacuation/2);
+validate_evacuation([{"--wait-takeover", _} | _] = Opts, Acc) ->
+    validate_pos_int(wait_takeover, Opts, Acc, fun validate_evacuation/2);
+validate_evacuation([{"--migrate-to", MigrateTo} | Rest], Acc) ->
+    %% Node names may be separated by commas and/or spaces.
+    Nodes = [list_to_atom(NodeName) || NodeName <- string:tokens(MigrateTo, ", ")],
+    case emqx_node_rebalance_evacuation:available_nodes(Nodes) of
+        [] ->
+            {error, "invalid --migrate-to, no nodes"};
+        Nodes ->
+            validate_evacuation(Rest, Acc#{migrate_to => Nodes});
+        Available ->
+            {error,
+             io_lib:format("invalid --migrate-to, unavailable nodes: ~p",
+                           [Nodes -- Available])}
+    end;
+validate_evacuation(Rest, _Acc) ->
+    {error, io_lib:format("unknown evacuation arguments: ~p", [Rest])}.
+
+%% Validate the collected options of a rebalance and convert them into the
+%% atom-keyed option map. Options that are not rebalance options are
+%% reported as an error.
+validate_rebalance([], Map) ->
+    {ok, Map};
+validate_rebalance([{"--wait-health-check", _} | _] = Opts, Map) ->
+    validate_pos_int(wait_health_check, Opts, Map, fun validate_rebalance/2);
+validate_rebalance([{"--conn-evict-rate", _} | _] = Opts, Map) ->
+    validate_pos_int(conn_evict_rate, Opts, Map, fun validate_rebalance/2);
+validate_rebalance([{"--sess-evict-rate", _} | _] = Opts, Map) ->
+    validate_pos_int(sess_evict_rate, Opts, Map, fun validate_rebalance/2);
+validate_rebalance([{"--abs-conn-threshold", _} | _] = Opts, Map) ->
+    validate_pos_int(abs_conn_threshold, Opts, Map, fun validate_rebalance/2);
+validate_rebalance([{"--rel-conn-threshold", _} | _] = Opts, Map) ->
+    validate_fraction(rel_conn_threshold, Opts, Map, fun validate_rebalance/2);
+validate_rebalance([{"--abs-sess-threshold", _} | _] = Opts, Map) ->
+    validate_pos_int(abs_sess_threshold, Opts, Map, fun validate_rebalance/2);
+validate_rebalance([{"--rel-sess-threshold", _} | _] = Opts, Map) ->
+    validate_fraction(rel_sess_threshold, Opts, Map, fun validate_rebalance/2);
+validate_rebalance([{"--wait-takeover", _} | _] = Opts, Map) ->
+    validate_pos_int(wait_takeover, Opts, Map, fun validate_rebalance/2);
+validate_rebalance([{"--nodes", NodeStr} | Rest], Map) ->
+    %% node names are separated by commas and/or spaces
+    Nodes = lists:map(fun list_to_atom/1, string:tokens(NodeStr, ", ")),
+    case emqx_node_rebalance:available_nodes(Nodes) of
+        [] ->
+            {error, "invalid --nodes, no nodes"};
+        Nodes ->
+            %% every requested node is available
+            validate_rebalance(Rest, Map#{nodes => Nodes});
+        OtherNodes ->
+            {error,
+             io_lib:format("invalid --nodes, unavailable nodes: ~p",
+                           [Nodes -- OtherNodes])}
+    end;
+validate_rebalance(Rest, _Map) ->
+    {error, io_lib:format("unknown rebalance arguments: ~p", [Rest])}.
+
+%% Parse the value of the first option as a number strictly greater than 1.0,
+%% store it (as a float) under `Name` and continue with `Next`.
+%% string:to_float/1 only accepts input containing a decimal point, so an
+%% integer-formatted value such as "2" is parsed with string:to_integer/1
+%% instead of being rejected as invalid.
+validate_fraction(Name, [{OptionName, Value} | Rest], Map, Next) ->
+    Parsed = case string:to_float(Value) of
+                 {Float, ""} ->
+                     Float;
+                 _ ->
+                     case string:to_integer(Value) of
+                         {Int, ""} -> float(Int);
+                         _ -> invalid
+                     end
+             end,
+    case Parsed of
+        Num when is_float(Num), Num > 1.0 ->
+            Next(Rest, Map#{Name => Num});
+        _ ->
+            {error, "invalid " ++ OptionName ++ " value"}
+    end.
+
+%% Parse the value of the first option as a strictly positive integer, store
+%% it under `Name` and continue validating the remaining options with `Next`.
+%% Trailing characters after the number make the value invalid.
+validate_pos_int(Name, [{OptionName, Value} | Rest], Map, Next) ->
+    case string:to_integer(Value) of
+        {Int, ""} when Int > 0 ->
+            Next(Rest, Map#{Name => Int});
+        _ ->
+            {error, "invalid " ++ OptionName ++ " value"}
+    end.
diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance_evacuation.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_evacuation.erl
new file mode 100644
index 000000000..5dc99d736
--- /dev/null
+++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance_evacuation.erl
@@ -0,0 +1,298 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%% http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_evacuation).
+
+-include("emqx_node_rebalance.hrl").
+
+-include_lib("emqx/include/logger.hrl").
+-include_lib("emqx/include/types.hrl").
+-include_lib("snabbkaffe/include/snabbkaffe.hrl").
+
+%% Public API
+-export([start/1,
+         status/0,
+         stop/0
+        ]).
+
+-export([start_link/0]).
+
+-behavior(gen_statem).
+
+%% gen_statem callbacks
+-export([init/1,
+         callback_mode/0,
+         handle_event/4,
+         code_change/4
+        ]).
+
+%% is_node_available/0 is invoked on other nodes by available_nodes/1
+-export([is_node_available/0,
+         available_nodes/1]).
+
+-ifdef(TEST).
+-export([migrate_to/1]).
+-endif.
+
+%%--------------------------------------------------------------------
+%% APIs
+%%--------------------------------------------------------------------
+
+%% Retry interval (ms) used while no node is available to receive sessions.
+-define(EVICT_INTERVAL_NO_NODES, 30000).
+
+-type migrate_to() :: [node()] | undefined.
+
+-type start_opts() :: #{server_reference => emqx_eviction_agent:server_reference(),
+                        conn_evict_rate => pos_integer(),
+                        sess_evict_rate => pos_integer(),
+                        wait_takeover => pos_integer(),
+                        migrate_to => migrate_to()
+                       }.
+-type start_error() :: already_started | eviction_agent_busy.
+%% Shape of the map replied to a `status` call while evacuation is running;
+%% `state` is the current gen_statem state name.
+-type stats() :: #{
+                   state := atom(),
+                   initial_conns := non_neg_integer(),
+                   initial_sessions := non_neg_integer(),
+                   current_conns := non_neg_integer(),
+                   current_sessions := non_neg_integer(),
+                   conn_evict_rate := pos_integer(),
+                   sess_evict_rate := pos_integer(),
+                   server_reference := emqx_eviction_agent:server_reference(),
+                   migrate_to := migrate_to()
+                  }.
+%% Matches the replies actually produced by the `status` event handlers.
+-type status() :: {enabled, stats()} | disabled.
+
+%% Start evacuation of the local node. Options missing from `StartOpts` are
+%% taken from default_opts/0.
+-spec start(start_opts()) -> ok_or_error(start_error()).
+start(StartOpts) ->
+    Opts = maps:merge(default_opts(), StartOpts),
+    gen_statem:call(?MODULE, {start, Opts}).
+
+%% Stop a running evacuation; `{error, not_started}` if none is running.
+-spec stop() -> ok_or_error(not_started).
+stop() ->
+    gen_statem:call(?MODULE, stop).
+
+%% Report whether evacuation is running and, if so, its statistics.
+-spec status() -> status().
+status() ->
+    gen_statem:call(?MODULE, status).
+
+-spec start_link() -> startlink_ret().
+start_link() ->
+    gen_statem:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+%% Return the subset of `Nodes` on which is_node_available/0 succeeded.
+%% A failed call yields a non-atom `{badrpc, _}` element, which is dropped.
+-spec available_nodes(list(node())) -> list(node()).
+available_nodes(Nodes) when is_list(Nodes) ->
+    {Available, _} = rpc:multicall(Nodes, ?MODULE, is_node_available, []),
+    lists:filter(fun is_atom/1, Available).
+
+%%--------------------------------------------------------------------
+%% gen_statem callbacks
+%%--------------------------------------------------------------------
+
+callback_mode() -> handle_event_function.
+
+%% states: disabled, evicting_conns, waiting_takeover, evicting_sessions, prohibiting
+
+%% If persisted evacuation options can be read, evacuation is resumed right
+%% away (connection eviction is scheduled immediately); otherwise the server
+%% starts in the `disabled` state.
+init([]) ->
+    case emqx_node_rebalance_evacuation_persist:read(default_opts()) of
+        {ok, #{server_reference := ServerReference} = Opts} ->
+            ?LOG(warning, "Restoring evacuation state: ~p", [Opts]),
+            case emqx_eviction_agent:enable(?MODULE, ServerReference) of
+                ok ->
+                    Data = init_data(#{}, Opts),
+                    ok = warn_enabled(),
+                    {ok, evicting_conns, Data, [{state_timeout, 0, evict_conns}]};
+                {error, eviction_agent_busy} ->
+                    %% the eviction agent is in use by someone else:
+                    %% drop the persisted state and stay disabled
+                    emqx_node_rebalance_evacuation_persist:clear(),
+                    {ok, disabled, #{}}
+            end;
+        none ->
+            {ok, disabled, #{}}
+    end.
+
+%% start
+%% Only accepted in the `disabled` state; the options are persisted so that
+%% init/1 can restore the evacuation.
+handle_event({call, From},
+             {start, #{server_reference := ServerReference} = Opts},
+             disabled,
+             #{} = Data) ->
+    case emqx_eviction_agent:enable(?MODULE, ServerReference) of
+        ok ->
+            NewData = init_data(Data, Opts),
+            ok = emqx_node_rebalance_evacuation_persist:save(Opts),
+            ?LOG(warning, "Node evacuation started"),
+            {next_state,
+             evicting_conns,
+             NewData,
+             [{state_timeout, 0, evict_conns},
+              {reply, From, ok}]};
+        {error, eviction_agent_busy} ->
+            {keep_state_and_data,
+             [{reply, From, {error, eviction_agent_busy}}]}
+    end;
+handle_event({call, From}, {start, _Opts}, _State, #{}) ->
+    {keep_state_and_data,
+     [{reply, From, {error, already_started}}]};
+
+%% stop
+handle_event({call, From}, stop, disabled, #{}) ->
+    {keep_state_and_data,
+     [{reply, From, {error, not_started}}]};
+handle_event({call, From}, stop, _State, Data) ->
+    ok = emqx_node_rebalance_evacuation_persist:clear(),
+    _ = emqx_eviction_agent:disable(?MODULE),
+    ?LOG(warning, "Node evacuation stopped"),
+    {next_state,
+     disabled,
+     deinit(Data),
+     [{reply, From, ok}]};
+
+%% status
+handle_event({call, From}, status, disabled, #{}) ->
+    {keep_state_and_data,
+     [{reply, From, disabled}]};
+handle_event({call, From}, status, State, #{migrate_to := MigrateTo} = Data) ->
+    Stats = maps:with(
+              [initial_conns, current_conns,
+               initial_sessions, current_sessions,
+               server_reference, conn_evict_rate, sess_evict_rate],
+              Data),
+    %% migrate_to is reported as the list of currently available target nodes
+    {keep_state_and_data,
+     [{reply, From, {enabled, Stats#{state => State, migrate_to => migrate_to(MigrateTo)}}}]};
+
+%% conn eviction
+%% Evicts connections and re-arms the timer until the eviction agent reports
+%% zero connections, then moves to `waiting_takeover`.
+handle_event(state_timeout,
+             evict_conns,
+             evicting_conns,
+             #{conn_evict_rate := ConnEvictRate,
+               wait_takeover := WaitTakeover} = Data) ->
+    case emqx_eviction_agent:status() of
+        {enabled, #{connections := Conns}} when Conns > 0 ->
+            ok = emqx_eviction_agent:evict_connections(ConnEvictRate),
+            ?tp(debug, node_evacuation_evict_conn, #{conn_evict_rate => ConnEvictRate}),
+            ?LOG(warning, "Node evacuation evict_conns, count=~p, conn_evict_rate=~p",
+                 [Conns, ConnEvictRate]),
+            NewData = Data#{current_conns => Conns},
+            {keep_state,
+             NewData,
+             [{state_timeout, ?EVICT_INTERVAL, evict_conns}]};
+        {enabled, #{connections := 0}} ->
+            NewData = Data#{current_conns => 0},
+            ?LOG(warning, "Node evacuation evict_conns over"),
+            %% wait_takeover is given in seconds
+            {next_state,
+             waiting_takeover,
+             NewData,
+             [{state_timeout, timer:seconds(WaitTakeover), evict_sessions}]}
+    end;
+
+%% the takeover wait has elapsed: start evicting sessions immediately
+handle_event(state_timeout,
+             evict_sessions,
+             waiting_takeover,
+             Data) ->
+    ?LOG(warning, "Node evacuation wait_takeover over"),
+    {next_state,
+     evicting_sessions,
+     Data,
+     [{state_timeout, 0, evict_sessions}]};
+
+%% session eviction
+%% Evicts sessions to the available target nodes until the eviction agent
+%% reports zero sessions, then moves to `prohibiting`.
+handle_event(state_timeout,
+             evict_sessions,
+             evicting_sessions,
+             #{sess_evict_rate := SessEvictRate,
+               migrate_to := MigrateTo,
+               current_sessions := CurrSessCount} = Data) ->
+    case emqx_eviction_agent:status() of
+        {enabled, #{sessions := SessCount}} when SessCount > 0 ->
+            case migrate_to(MigrateTo) of
+                [] ->
+                    %% NOTE: logs the last recorded session count (from Data),
+                    %% not the freshly read SessCount
+                    ?LOG(warning,
+                         "No nodes are available to evacuate sessions, session_count=~p",
+                         [CurrSessCount]),
+                    {keep_state_and_data,
+                     [{state_timeout, ?EVICT_INTERVAL_NO_NODES, evict_sessions}]};
+                Nodes ->
+                    ok = emqx_eviction_agent:evict_sessions(SessEvictRate, Nodes),
+                    ?LOG(warning, "Node evacuation evict_sessions, count=~p, sess_evict_rate=~p,"
+                         "target_nodes=~p", [SessCount, SessEvictRate, Nodes]),
+                    NewData = Data#{current_sessions => SessCount},
+                    {keep_state,
+                     NewData,
+                     [{state_timeout, ?EVICT_INTERVAL, evict_sessions}]}
+            end;
+        {enabled, #{sessions := 0}} ->
+            ?tp(debug, node_evacuation_evict_sess_over, #{}),
+            ?LOG(warning, "Node evacuation evict_sessions over"),
+            NewData = Data#{current_sessions => 0},
+            %% no timer is armed here: the state is left only via `stop`
+            {next_state,
+             prohibiting,
+             NewData}
+    end;
+
+%% catch-all clauses: log and ignore unexpected calls, messages and casts
+handle_event({call, From}, Msg, State, Data) ->
+    ?LOG(warning, "Unknown call: ~p, State: ~p, Data: ~p", [Msg, State, Data]),
+    {keep_state_and_data,
+     [{reply, From, ignored}]};
+
+handle_event(info, Msg, State, Data) ->
+    ?LOG(warning, "Unknown Msg: ~p, State: ~p, Data: ~p", [Msg, State, Data]),
+    keep_state_and_data;
+
+handle_event(cast, Msg, State, Data) ->
+    ?LOG(warning, "Unknown cast Msg: ~p, State: ~p, Data: ~p", [Msg, State, Data]),
+    keep_state_and_data.
+
+code_change(_Vsn, State, Data, _Extra) ->
+    {ok, State, Data}.
+
+%%--------------------------------------------------------------------
+%% internal funs
+%%--------------------------------------------------------------------
+
+%% Default values for every start option.
+%% The ?DEFAULT_* macros are not defined in this module; presumably they come
+%% from emqx_node_rebalance.hrl.
+default_opts() ->
+    #{
+        server_reference => undefined,
+        conn_evict_rate => ?DEFAULT_CONN_EVICT_RATE,
+        sess_evict_rate => ?DEFAULT_SESS_EVICT_RATE,
+        wait_takeover => ?DEFAULT_WAIT_TAKEOVER,
+        migrate_to => undefined
+    }.
+
+%% Merge the options into the state data and record the connection/session
+%% counts at start. Requires the eviction agent to be enabled already.
+init_data(Data0, Opts) ->
+    Data1 = maps:merge(Data0, Opts),
+    {enabled, #{connections := ConnCount, sessions := SessCount}} = emqx_eviction_agent:status(),
+    Data1#{
+        initial_conns => ConnCount,
+        current_conns => ConnCount,
+        initial_sessions => SessCount,
+        current_sessions => SessCount
+    }.
+
+%% Remove everything init_data/2 added (counters and option keys).
+deinit(Data) ->
+    Keys = [initial_conns, current_conns, initial_sessions, current_sessions]
+        ++ maps:keys(default_opts()),
+    maps:without(Keys, Data).
+
+%% Announce both in the log and on standard error that evacuation is active.
+warn_enabled() ->
+    Msg = "Node evacuation is enabled. The node will not receive connections.",
+    ?LOG(warning, Msg),
+    io:format(standard_error, "~s~n", [Msg]).
+
+%% Resolve the session migration targets: `undefined` means every other
+%% cluster node; the result is narrowed to the nodes currently available.
+migrate_to(undefined) ->
+    migrate_to(all_nodes());
+migrate_to(Nodes) when is_list(Nodes) ->
+    available_nodes(Nodes).
+
+%% Called on candidate nodes by available_nodes/1. Returns node() only when
+%% the local eviction agent is disabled; otherwise the match fails and the
+%% call crashes, which available_nodes/1 treats as "not available".
+is_node_available() ->
+    disabled = emqx_eviction_agent:status(),
+    node().
+
+%% All cluster nodes except the local one.
+all_nodes() ->
+    ekka_mnesia:cluster_nodes(all) -- [node()].
diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance_evacuation_persist.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_evacuation_persist.erl
new file mode 100644
index 000000000..06d8800da
--- /dev/null
+++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance_evacuation_persist.erl
@@ -0,0 +1,109 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%% http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_evacuation_persist).
+
+-export([save/1,
+         clear/0,
+         read/1]).
+
+
+-ifdef(TEST).
+-export([evacuation_filepath/0]).
+-endif.
+
+-include("emqx_node_rebalance.hrl").
+-include_lib("emqx/include/types.hrl").
+
+%%--------------------------------------------------------------------
+%% APIs
+%%--------------------------------------------------------------------
+
+%% do not persist `migrate_to`:
+%% * after restart there is nothing to migrate
+%% * this value may be invalid after node was offline
+-type start_opts() :: #{server_reference => emqx_eviction_agent:server_reference(),
+                        conn_evict_rate => pos_integer(),
+                        sess_evict_rate => pos_integer(),
+                        wait_takeover => pos_integer()
+                       }.
+
+%% Write the persistable subset of the evacuation options to the evacuation
+%% file as JSON, creating the parent directory if needed.
+%% Crashes with function_clause when the options do not pass the guard.
+-spec save(start_opts()) -> ok_or_error(term()).
+save(#{server_reference := ServerReference,
+       conn_evict_rate := ConnEvictRate,
+       sess_evict_rate := SessEvictRate,
+       wait_takeover := WaitTakeover} = Data)
+  when (is_binary(ServerReference) orelse ServerReference =:= undefined) andalso
+       is_integer(ConnEvictRate) andalso ConnEvictRate > 0 andalso
+       is_integer(SessEvictRate) andalso SessEvictRate > 0 andalso
+       is_integer(WaitTakeover) andalso WaitTakeover >= 0 ->
+    Filepath = evacuation_filepath(),
+    case filelib:ensure_dir(Filepath) of
+        ok ->
+            JsonData = emqx_json:encode(
+                         prepare_for_encode(maps:with(persist_keys(), Data)),
+                         [pretty]),
+            file:write_file(Filepath, JsonData);
+        {error, _} = Error -> Error
+    end.
+
+%% Remove the persisted evacuation state.
+%% An already missing file counts as success, so clearing is idempotent and a
+%% caller asserting `ok = clear()` does not crash when the file is gone.
+%% Any other file error is returned to the caller.
+-spec clear() -> ok_or_error(term()).
+clear() ->
+    case file:delete(evacuation_filepath()) of
+        ok -> ok;
+        {error, enoent} -> ok;
+        {error, _} = Error -> Error
+    end.
+
+%% Read the persisted evacuation options.
+%% Returns `none` when the file cannot be read. A file that exists but does
+%% not decode to a JSON object yields the defaults, i.e. the presence of the
+%% file alone is treated as "evacuation was enabled".
+-spec read(start_opts()) -> {ok, start_opts()} | none.
+read(DefaultOpts) ->
+    case file:read_file(evacuation_filepath()) of
+        {ok, Data} ->
+            case emqx_json:safe_decode(Data, [return_maps]) of
+                {ok, Map} when is_map(Map) ->
+                    {ok, map_to_opts(DefaultOpts, Map)};
+                _NotAMap ->
+                    {ok, DefaultOpts}
+            end;
+        {error, _} ->
+            none
+    end.
+
+%%--------------------------------------------------------------------
+%% Internal funcs
+%%--------------------------------------------------------------------
+
+%% Option keys that are written to the evacuation file.
+persist_keys() ->
+    [server_reference,
+     conn_evict_rate,
+     sess_evict_rate,
+     wait_takeover].
+
+%% JSON has no `undefined`: an absent server reference is stored as `null`.
+prepare_for_encode(#{server_reference := undefined} = Data) ->
+    Data#{server_reference => null};
+prepare_for_encode(Data) -> Data.
+
+%% Inverse of prepare_for_encode/1 for the server reference.
+format_after_decode(#{server_reference := null} = Data) ->
+    Data#{server_reference => undefined};
+format_after_decode(Data) -> Data.
+
+%% Build an option map that has exactly the keys of `DefaultOpts`, taking each
+%% value from the decoded JSON map (binary keys) when present and from the
+%% defaults otherwise.
+map_to_opts(DefaultOpts, Map) ->
+    format_after_decode(
+        map_to_opts(
+          maps:to_list(DefaultOpts), Map, #{})).
+
+map_to_opts([], _Map, Opts) -> Opts;
+map_to_opts([{Key, DefaultVal} | Rest], Map, Opts) ->
+    map_to_opts(Rest, Map, Opts#{Key => maps:get(atom_to_binary(Key), Map, DefaultVal)}).
+
+%% Location of the evacuation file inside the configured data directory.
+evacuation_filepath() ->
+    filename:join([emqx:get_env(data_dir), ?EVACUATION_FILENAME]).
diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance_status.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_status.erl
new file mode 100644
index 000000000..4002d7c13
--- /dev/null
+++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance_status.erl
@@ -0,0 +1,225 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%% http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_status).
+
+-export([local_status/0,
+         local_status/1,
+         global_status/0,
+         format_local_status/1,
+         format_coordinator_status/1]).
+
+%% For RPC
+-export([evacuation_status/0,
+         rebalance_status/0]).
+
+%%--------------------------------------------------------------------
+%% APIs
+%%--------------------------------------------------------------------
+
+%% Status of the local node: a running evacuation takes precedence; otherwise
+%% the node is reported as rebalancing only when it is a donor of a running
+%% rebalance.
+-spec local_status() -> disabled | {evacuation, map()} | {rebalance, map()}.
+local_status() ->
+    case emqx_node_rebalance_evacuation:status() of
+        disabled ->
+            local_rebalance_status();
+        {enabled, EvacuationStatus} ->
+            {evacuation, evacuation(EvacuationStatus)}
+    end.
+
+%% Same as local_status/0, evaluated on `Node`.
+-spec local_status(node()) -> disabled | {evacuation, map()} | {rebalance, map()}.
+local_status(Node) ->
+    rpc:call(Node, ?MODULE, local_status, []).
+
+%% Render a local status map as printable text.
+-spec format_local_status(map()) -> iodata().
+format_local_status(Status) ->
+    format_status(Status, local_status_field_format_order()).
+
+%% Collect the running rebalances and evacuations of all cluster nodes.
+%% Nodes that report `disabled` or fail to answer are left out.
+-spec global_status() -> #{rebalances := [{node(), map()}], evacuations := [{node(), map()}]}.
+global_status() ->
+    ClusterNodes = ekka_mnesia:cluster_nodes(all),
+    {RebalanceReplies, _} = rpc:multicall(ClusterNodes, ?MODULE, rebalance_status, []),
+    Rebalances = [{Node, coordinator_rebalance(Status)}
+                  || {Node, {enabled, Status}} <- RebalanceReplies],
+    {EvacuationReplies, _} = rpc:multicall(ClusterNodes, ?MODULE, evacuation_status, []),
+    Evacuations = [{Node, evacuation(Status)}
+                   || {Node, {enabled, Status}} <- EvacuationReplies],
+    #{rebalances => Rebalances, evacuations => Evacuations}.
+
+%% Render a coordinator status map as printable text.
+-spec format_coordinator_status(map()) -> iodata().
+format_coordinator_status(Status) ->
+    format_status(Status, coordinator_status_field_format_order()).
+
+%% Rebalance part of local_status/0: ask the local rebalance agent for its
+%% coordinator and, if there is one, ask the coordinator for the status.
+local_rebalance_status() ->
+    case emqx_node_rebalance_agent:status() of
+        disabled ->
+            disabled;
+        {enabled, CoordinatorPid} ->
+            case emqx_node_rebalance:status(CoordinatorPid) of
+                disabled ->
+                    disabled;
+                {enabled, RebalanceStatus} ->
+                    local_rebalance(RebalanceStatus, node())
+            end
+    end.
+
+%%--------------------------------------------------------------------
+%% Internal functions
+%%--------------------------------------------------------------------
+
+%% Convert the status reported by the evacuation process into the
+%% presentation map. Both goals are always 0 for an evacuation.
+evacuation(Status) ->
+    Counters = #{
+        initial_connected => maps:get(initial_conns, Status),
+        current_connected => maps:get(current_conns, Status),
+        initial_sessions => maps:get(initial_sessions, Status),
+        current_sessions => maps:get(current_sessions, Status)
+    },
+    #{
+        state => maps:get(state, Status),
+        connection_eviction_rate => maps:get(conn_evict_rate, Status),
+        session_eviction_rate => maps:get(sess_evict_rate, Status),
+        connection_goal => 0,
+        session_goal => 0,
+        session_recipients => maps:get(migrate_to, Status),
+        stats => Counters
+    }.
+
+%% Only donor nodes take an active part in a rebalance; any other node is
+%% reported as `disabled`.
+local_rebalance(#{donors := Donors} = Stats, Node) ->
+    case lists:member(Node, Donors) of
+        false -> disabled;
+        true -> {rebalance, donor_rebalance(Stats, Node)}
+    end.
+
+%% Presentation map of a rebalance as seen from donor node `Node`.
+%% The goal entries are included only when the coordinator status carries the
+%% corresponding recipient averages.
+donor_rebalance(Status, Node) ->
+    Opts = maps:get(opts, Status),
+    InitialConnCounts = maps:get(initial_conn_counts, Status),
+    InitialSessCounts = maps:get(initial_sess_counts, Status),
+
+    NodeStats = #{
+        initial_connected => maps:get(Node, InitialConnCounts),
+        initial_sessions => maps:get(Node, InitialSessCounts),
+        current_connected => emqx_eviction_agent:connection_count(),
+        current_sessions => emqx_eviction_agent:session_count(),
+        current_disconnected_sessions => emqx_eviction_agent:session_count(
+                                           disconnected)
+    },
+    Mandatory = [
+        {state, maps:get(state, Status)},
+        {coordinator_node, maps:get(coordinator_node, Status)},
+        {connection_eviction_rate, maps:get(conn_evict_rate, Opts)},
+        {session_eviction_rate, maps:get(sess_evict_rate, Opts)},
+        {recipients, maps:get(recipients, Status)},
+        {stats, NodeStats}
+    ],
+    Goals = [{GoalName, maps:get(AvgKey, Status)}
+             || {GoalName, AvgKey} <- [{connection_goal, recipient_conn_avg},
+                                       {disconnected_session_goal, recipient_sess_avg}],
+                maps:is_key(AvgKey, Status)],
+    maps:from_list(Mandatory ++ Goals).
+
+%% Presentation map of a rebalance as seen from its coordinator.
+%% Goals and donor averages are included only when the coordinator status
+%% carries the corresponding keys.
+coordinator_rebalance(Status) ->
+    Opts = maps:get(opts, Status),
+    Mandatory = [
+        {state, maps:get(state, Status)},
+        {coordinator_node, maps:get(coordinator_node, Status)},
+        {connection_eviction_rate, maps:get(conn_evict_rate, Opts)},
+        {session_eviction_rate, maps:get(sess_evict_rate, Opts)},
+        {recipients, maps:get(recipients, Status)},
+        {donors, maps:get(donors, Status)}
+    ],
+    Optional = [{OutKey, maps:get(InKey, Status)}
+                || {OutKey, InKey} <- [{connection_goal, recipient_conn_avg},
+                                       {disconnected_session_goal, recipient_sess_avg},
+                                       {donor_conn_avg, donor_conn_avg},
+                                       {donor_sess_avg, donor_sess_avg}],
+                   maps:is_key(InKey, Status)],
+    maps:from_list(Mandatory ++ Optional).
+
+%% Order in which the fields of a local status are printed.
+local_status_field_format_order() ->
+    [
+        state,
+        coordinator_node,
+        connection_eviction_rate,
+        session_eviction_rate,
+        connection_goal,
+        session_goal,
+        disconnected_session_goal,
+        session_recipients,
+        recipients,
+        stats
+    ].
+
+%% Order in which the fields of a coordinator status are printed.
+coordinator_status_field_format_order() ->
+    [
+        state,
+        coordinator_node,
+        donors,
+        recipients,
+        connection_eviction_rate,
+        session_eviction_rate,
+        connection_goal,
+        disconnected_session_goal,
+        donor_conn_avg,
+        donor_sess_avg
+    ].
+
+%% Format the fields of `Status` in the given order, skipping fields that are
+%% absent from the map.
+format_status(Status, FieldOrder) ->
+    [format_local_status_field({FieldName, Value})
+     || FieldName <- FieldOrder,
+        {ok, Value} <- [maps:find(FieldName, Status)]].
+
+%% Render one status field as a line of text; `stats` expands to a block.
+format_local_status_field({stats, Stats}) ->
+    format_local_stats(Stats);
+format_local_status_field({Field, Value}) ->
+    io_lib:format(status_field_format(Field), [Value]).
+
+%% io_lib format string used for each single-line status field.
+status_field_format(state) ->
+    "Rebalance state: ~p~n";
+status_field_format(coordinator_node) ->
+    "Coordinator node: ~p~n";
+status_field_format(connection_eviction_rate) ->
+    "Connection eviction rate: ~p connections/second~n";
+status_field_format(session_eviction_rate) ->
+    "Session eviction rate: ~p sessions/second~n";
+status_field_format(connection_goal) ->
+    "Connection goal: ~p~n";
+status_field_format(session_goal) ->
+    "Session goal: ~p~n";
+status_field_format(disconnected_session_goal) ->
+    "Disconnected session goal: ~p~n";
+status_field_format(session_recipients) ->
+    "Session recipient nodes: ~p~n";
+status_field_format(recipients) ->
+    "Recipient nodes: ~p~n";
+status_field_format(donors) ->
+    "Donor nodes: ~p~n";
+status_field_format(donor_conn_avg) ->
+    "Current average donor node connection count: ~p~n";
+status_field_format(donor_sess_avg) ->
+    "Current average donor node disconnected session count: ~p~n".
+
+%% Render the statistics map as a header line followed by one line per entry.
+format_local_stats(Stats) ->
+    Lines = [io_lib:format(" ~p: ~p~n", [Name, Value])
+             || {Name, Value} <- maps:to_list(Stats)],
+    ["Channel statistics:\n" | Lines].
+
+%% RPC entry point used by global_status/0: evacuation status tagged with the
+%% node it was taken on.
+evacuation_status() ->
+    {node(), emqx_node_rebalance_evacuation:status()}.
+
+%% RPC entry point used by global_status/0: rebalance status tagged with the
+%% node it was taken on.
+rebalance_status() ->
+    {node(), emqx_node_rebalance:status()}.
diff --git a/apps/emqx_node_rebalance/src/emqx_node_rebalance_sup.erl b/apps/emqx_node_rebalance/src/emqx_node_rebalance_sup.erl
new file mode 100644
index 000000000..e677eb22e
--- /dev/null
+++ b/apps/emqx_node_rebalance/src/emqx_node_rebalance_sup.erl
@@ -0,0 +1,44 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%% http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_sup).
+
+-behaviour(supervisor).
+
+-export([start_link/1]).
+
+-export([init/1]).
+
+%% Start the supervisor, registered locally under the module name.
+%% `Env` is handed to init/1, which does not use it.
+start_link(Env) ->
+    supervisor:start_link({local, ?MODULE}, ?MODULE, [Env]).
+
+%% One permanent worker per listed module, restarted independently
+%% (one_for_one); at most 10 restarts within 3600 seconds.
+init([_Env]) ->
+    Workers = [emqx_node_rebalance_evacuation,
+               emqx_node_rebalance_agent,
+               emqx_node_rebalance],
+    SupFlags = {one_for_one, 10, 3600},
+    {ok, {SupFlags, [child_spec(Worker, []) || Worker <- Workers]}}.
+
+%% Child specification of a permanent worker that is started through
+%% `Mod:start_link` with `Args` and given 5000 ms to shut down.
+child_spec(Mod, Args) ->
+    #{id => Mod, start => {Mod, start_link, Args},
+      restart => permanent, shutdown => 5000,
+      type => worker, modules => [Mod]}.
diff --git a/apps/emqx_node_rebalance/test/emqx_node_rebalance_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_SUITE.erl
new file mode 100644
index 000000000..c5c27aec0
--- /dev/null
+++ b/apps/emqx_node_rebalance/test/emqx_node_rebalance_SUITE.erl
@@ -0,0 +1,183 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%% http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_SUITE).
+
+-compile(export_all).
+-compile(nowarn_export_all).
+
+-include_lib("emqx/include/emqx.hrl").
+-include_lib("emqx/include/emqx_mqtt.hrl").
+-include_lib("eunit/include/eunit.hrl").
+-include_lib("common_test/include/ct.hrl").
+-include_lib("snabbkaffe/include/snabbkaffe.hrl").
+
+-import(emqx_eviction_agent_test_helpers,
+        [emqtt_connect_many/1]).
+
+all() ->
+    emqx_ct:all(?MODULE).
+
+%% The eviction agent and rebalance applications run for the whole suite.
+init_per_suite(Config) ->
+    emqx_ct_helpers:start_apps([emqx_eviction_agent, emqx_node_rebalance]),
+    Config.
+
+end_per_suite(Config) ->
+    emqx_ct_helpers:stop_apps([emqx_node_rebalance, emqx_eviction_agent]),
+    Config.
+
+%% Every test case starts with no rebalance running and a freshly started
+%% slave node `recipient1`, published in Config as `recipient_node`.
+init_per_testcase(_Case, Config) ->
+    _ = emqx_node_rebalance:stop(),
+    Node = emqx_node_helpers:start_slave(
+             recipient1,
+             #{start_apps => [emqx, emqx_eviction_agent, emqx_node_rebalance]}),
+    [{recipient_node, Node} | Config].
+
+%% Stop the slave node and any rebalance the test case left running.
+end_per_testcase(_Case, Config) ->
+    _ = emqx_node_helpers:stop_slave(?config(recipient_node, Config)),
+    _ = emqx_node_rebalance:stop().
+
+%% Connect 500 clients locally, rebalance between the local node and the
+%% recipient, wait for the `emqx_node_rebalance_evict_sess_over` trace event
+%% and check that the donor ends up at most 50 connections / disconnected
+%% sessions above the recipient (the configured absolute thresholds).
+t_rebalance(Config) ->
+    process_flag(trap_exit, true),
+    RecipientNode = ?config(recipient_node, Config),
+
+    Nodes = [node(), RecipientNode],
+
+    _Conns = emqtt_connect_many(500),
+
+    Opts = #{conn_evict_rate => 10,
+             sess_evict_rate => 10,
+             evict_interval => 10,
+             abs_conn_threshold => 50,
+             abs_sess_threshold => 50,
+             rel_conn_threshold => 1.0,
+             rel_sess_threshold => 1.0,
+             wait_health_check => 0.01,
+             wait_takeover => 0.01,
+             nodes => Nodes
+            },
+
+    ?check_trace(
+       ?wait_async_action(
+          emqx_node_rebalance:start(Opts),
+          #{?snk_kind := emqx_node_rebalance_evict_sess_over},
+          10000),
+       fun({ok, _}, Trace) ->
+               ?assertMatch(
+                  [_ | _],
+                  ?of_kind(emqx_node_rebalance_evict_sess_over, Trace))
+       end),
+
+    DonorConnCount = emqx_eviction_agent:connection_count(),
+    DonorSessCount = emqx_eviction_agent:session_count(),
+    DonorDSessCount = emqx_eviction_agent:session_count(disconnected),
+
+    RecipientConnCount = rpc:call(RecipientNode, emqx_eviction_agent, connection_count, []),
+    RecipientSessCount = rpc:call(RecipientNode, emqx_eviction_agent, session_count, []),
+    RecipientDSessCount = rpc:call(RecipientNode, emqx_eviction_agent, session_count, [disconnected]),
+
+    ct:pal("Donor: conn=~p, sess=~p, dsess=~p",
+           [DonorConnCount, DonorSessCount, DonorDSessCount]),
+    ct:pal("Recipient: conn=~p, sess=~p, dsess=~p",
+           [RecipientConnCount, RecipientSessCount, RecipientDSessCount]),
+
+    ?assert(DonorConnCount - 50 =< RecipientConnCount),
+    ?assert(DonorDSessCount - 50 =< RecipientDSessCount).
+
+%% Start a rebalance, then stop the recipient node while it is running and
+%% check that the rebalance ends up disabled.
+t_rebalance_node_crash(Config) ->
+    process_flag(trap_exit, true),
+    RecipientNode = ?config(recipient_node, Config),
+
+    Nodes = [node(), RecipientNode],
+
+    _Conns = emqtt_connect_many(50),
+
+    Opts = #{conn_evict_rate => 10,
+             sess_evict_rate => 10,
+             evict_interval => 10,
+             abs_conn_threshold => 50,
+             abs_sess_threshold => 50,
+             rel_conn_threshold => 1.0,
+             rel_sess_threshold => 1.0,
+             wait_health_check => 0.01,
+             wait_takeover => 0.01,
+             nodes => Nodes
+            },
+
+    ok = emqx_node_rebalance:start(Opts),
+
+    %% the result of waiting for the trace event is deliberately not checked
+    ?check_trace(
+        ?wait_async_action(
+            emqx_node_helpers:stop_slave(?config(recipient_node, Config)),
+            #{?snk_kind := emqx_node_rebalance_started},
+            1000),
+        fun(_Result, _Trace) -> ok end),
+
+    ?assertEqual(
+       disabled,
+       emqx_node_rebalance:status()).
+
+%% Starting with default options is refused with `nothing_to_balance`, both
+%% without any connections and with 50 local connections.
+t_no_need_to_rebalance(_Config) ->
+    process_flag(trap_exit, true),
+
+    ?assertEqual(
+       {error, nothing_to_balance},
+       emqx_node_rebalance:start(#{})),
+
+    _Conns = emqtt_connect_many(50),
+
+    ?assertEqual(
+       {error, nothing_to_balance},
+       emqx_node_rebalance:start(#{})).
+
+%% Unknown messages, casts and calls are tolerated (calls are answered with
+%% `ignored`) both before and after a rebalance has been started.
+t_unknown_mesages(Config) ->
+    process_flag(trap_exit, true),
+    RecipientNode = ?config(recipient_node, Config),
+
+    Nodes = [node(), RecipientNode],
+
+    _Conns = emqtt_connect_many(500),
+
+    Opts = #{wait_health_check => 100,
+             abs_conn_threshold => 50,
+             nodes => Nodes
+            },
+
+    Pid = whereis(emqx_node_rebalance),
+
+    Pid ! unknown,
+    ok = gen_server:cast(Pid, unknown),
+    ?assertEqual(
+       ignored,
+       gen_server:call(Pid, unknown)),
+
+    ok = emqx_node_rebalance:start(Opts),
+
+    Pid ! unknown,
+    ok = gen_server:cast(Pid, unknown),
+    ?assertEqual(
+       ignored,
+       gen_server:call(Pid, unknown)).
+
+%% A node whose eviction agent is enabled is not reported as available.
+t_available_nodes(Config) ->
+    rpc:call(?config(recipient_node, Config),
+             emqx_eviction_agent,
+             enable,
+             [test_rebalance, undefined]),
+    ?assertEqual(
+       [node()],
+       emqx_node_rebalance:available_nodes(
+         [node(), ?config(recipient_node, Config)])).
diff --git a/apps/emqx_node_rebalance/test/emqx_node_rebalance_agent_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_agent_SUITE.erl
new file mode 100644
index 000000000..30edd51fe
--- /dev/null
+++ b/apps/emqx_node_rebalance/test/emqx_node_rebalance_agent_SUITE.erl
@@ -0,0 +1,163 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%% http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_agent_SUITE).
+
+-compile(export_all).
+-compile(nowarn_export_all).
+
+-include_lib("emqx/include/emqx.hrl").
+-include_lib("emqx/include/emqx_mqtt.hrl").
+-include_lib("eunit/include/eunit.hrl").
+-include_lib("common_test/include/ct.hrl").
+-include_lib("snabbkaffe/include/snabbkaffe.hrl").
+
+all() ->
+    emqx_ct:all(?MODULE).
+
+%% The eviction agent and rebalance applications run for the whole suite.
+init_per_suite(Config) ->
+    emqx_ct_helpers:start_apps([emqx_eviction_agent, emqx_node_rebalance]),
+    Config.
+
+end_per_suite(Config) ->
+    emqx_ct_helpers:stop_apps([emqx_node_rebalance, emqx_eviction_agent]),
+    Config.
+
+%% Every test case starts with no evacuation running and a freshly started
+%% slave node `evacuate1`, published in Config as `evacuate_node`.
+init_per_testcase(_Case, Config) ->
+    _ = emqx_node_rebalance_evacuation:stop(),
+    Node = emqx_node_helpers:start_slave(
+             evacuate1,
+             #{start_apps => [emqx, emqx_eviction_agent, emqx_node_rebalance]}),
+    [{evacuate_node, Node} | Config].
+
+%% Stops the slave node started for the test case and any running evacuation.
+end_per_testcase(_Case, Config) ->
+    _ = emqx_node_helpers:stop_slave(?config(evacuate_node, Config)),
+    _ = emqx_node_rebalance_evacuation:stop().
+
+%% Walks the agent through enable/disable and checks every rejected transition:
+%% double enable, disable by a foreign coordinator, double disable.
+t_enable_disable(_Config) ->
+    ?assertEqual(
+        disabled,
+        emqx_node_rebalance_agent:status()),
+
+    ?assertEqual(
+        ok,
+        emqx_node_rebalance_agent:enable(self())),
+
+    ?assertEqual(
+        {error, already_enabled},
+        emqx_node_rebalance_agent:enable(self())),
+
+    ?assertEqual(
+        {enabled, self()},
+        emqx_node_rebalance_agent:status()),
+
+    ?assertEqual(
+        {error, invalid_coordinator},
+        emqx_node_rebalance_agent:disable(spawn_link(fun() -> ok end))),
+
+    ?assertEqual(
+        ok,
+        emqx_node_rebalance_agent:disable(self())),
+
+    ?assertEqual(
+        {error, already_disabled},
+        emqx_node_rebalance_agent:disable(self())),
+
+    ?assertEqual(
+        disabled,
+        emqx_node_rebalance_agent:status()).
+
+%% The rebalance agent refuses to enable while the eviction agent is held by
+%% another kind.
+t_enable_egent_busy(_Config) ->
+    ok = emqx_eviction_agent:enable(rebalance_test, undefined),
+
+    %% Release the eviction agent even when the assertion fails; otherwise it
+    %% stays enabled on the test node and leaks into the following test cases.
+    try
+        ?assertEqual(
+            {error, eviction_agent_busy},
+            emqx_node_rebalance_agent:enable(self()))
+    after
+        ok = emqx_eviction_agent:disable(rebalance_test)
+    end.
+
+% The following tests verify that emqx_node_rebalance_agent correctly links
+% coordinator process with emqx_eviction_agent-s.
+
+%% Killing the coordinator must take down the remote emqx_eviction_agent.
+t_rebalance_agent_coordinator_fail(Config) ->
+    process_flag(trap_exit, true),
+
+    Node = ?config(evacuate_node, Config),
+
+
+    CoordinatorPid = spawn_link(
+                       fun() ->
+                           receive
+                               done -> ok
+                           end
+                       end),
+
+    ?assertEqual(
+        disabled,
+        rpc:call(Node, emqx_eviction_agent, status, [])),
+
+    ?assertEqual(
+        ok,
+        rpc:call(Node, emqx_node_rebalance_agent, enable, [CoordinatorPid])),
+
+    ?assertMatch(
+        {enabled, _},
+        rpc:call(Node, emqx_eviction_agent, status, [])),
+
+    %% Link to the remote agent so that its exit arrives here as an 'EXIT'
+    %% message (exits are trapped above).
+    EvictionAgentPid = rpc:call(Node, erlang, whereis, [emqx_eviction_agent]),
+    true = link(EvictionAgentPid),
+
+    true = exit(CoordinatorPid, kill),
+
+    receive
+        {'EXIT', EvictionAgentPid, _} -> true
+    after
+        1000 -> ?assert(false, "emqx_eviction_agent did not exit")
+    end.
+
+%% Killing the remote emqx_eviction_agent must take down the coordinator
+%% process that enabled it.
+t_rebalance_agent_fail(Config) ->
+    process_flag(trap_exit, true),
+
+    Node = ?config(evacuate_node, Config),
+
+    CoordinatorPid = spawn_link(
+                       fun() ->
+                           receive
+                               done -> ok
+                           end
+                       end),
+
+    ?assertEqual(
+        ok,
+        rpc:call(Node, emqx_node_rebalance_agent, enable, [CoordinatorPid])),
+
+    EvictionAgentPid = rpc:call(Node, erlang, whereis, [emqx_eviction_agent]),
+    true = exit(EvictionAgentPid, kill),
+
+    %% The coordinator is linked to this process, so its exit is observed here.
+    receive
+        {'EXIT', CoordinatorPid, _} -> true
+    after
+        1000 -> ?assert(false, "coordinator did not exit")
+    end.
+
+%% Unknown cast/info/call messages are tolerated by the agent; unknown calls
+%% are answered with `ignored`.
+t_unknown_messages(_Config) ->
+    Pid = whereis(emqx_node_rebalance_agent),
+
+    ok = gen_server:cast(Pid, unknown),
+
+    Pid ! unknown,
+
+    ignored = gen_server:call(Pid, unknown).
diff --git a/apps/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl
new file mode 100644
index 000000000..a15b8e8ab
--- /dev/null
+++ b/apps/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl
@@ -0,0 +1,321 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_api_SUITE).
+
+-compile(export_all).
+-compile(nowarn_export_all).
+
+-include_lib("eunit/include/eunit.hrl").
+-include_lib("common_test/include/ct.hrl").
+
+-import(emqx_mgmt_api_test_helpers,
+        [request_api/3,
+         request_api/5,
+         auth_header_/0,
+         api_path/1]).
+
+-import(emqx_eviction_agent_test_helpers,
+        [emqtt_connect_many/1]).
+
+%% Test cases are discovered through emqx_ct:all/1.
+all() ->
+    emqx_ct:all(?MODULE).
+
+%% Applications are started here and stopped in reverse order in
+%% end_per_suite/1.
+init_per_suite(Config) ->
+    Apps = [emqx_eviction_agent, emqx_node_rebalance, emqx_management],
+    emqx_ct_helpers:start_apps(Apps),
+    Config.
+
+end_per_suite(Config) ->
+    Apps = [emqx_management, emqx_node_rebalance, emqx_eviction_agent],
+    emqx_ct_helpers:stop_apps(Apps),
+    Config.
+
+%% Every case starts with rebalance and evacuation stopped; the three cases
+%% listed below additionally get a `recipient1` slave node in Config.
+init_per_testcase(Case, Config) ->
+    _ = emqx_node_rebalance:stop(),
+    _ = emqx_node_rebalance_evacuation:stop(),
+    NeedsRecipient = lists:member(Case, [t_start_evacuation_validation,
+                                         t_start_rebalance_validation,
+                                         t_start_stop_rebalance]),
+    case NeedsRecipient of
+        true ->
+            StartApps = [emqx, emqx_eviction_agent, emqx_node_rebalance],
+            Node = emqx_node_helpers:start_slave(
+                     recipient1,
+                     #{start_apps => StartApps}),
+            [{recipient_node, Node} | Config];
+        false ->
+            Config
+    end.
+
+%% Mirrors init_per_testcase/2: the slave node (when one was started) is
+%% stopped first, then rebalance and evacuation.
+end_per_testcase(Case, Config) ->
+    HasRecipient = lists:member(Case, [t_start_evacuation_validation,
+                                       t_start_rebalance_validation,
+                                       t_start_stop_rebalance]),
+    case HasRecipient of
+        true ->
+            _ = emqx_node_helpers:stop_slave(?config(recipient_node, Config));
+        false ->
+            ok
+    end,
+    _ = emqx_node_rebalance:stop(),
+    _ = emqx_node_rebalance_evacuation:stop().
+
+%% Evacuation start requests carrying invalid options, or addressed to an
+%% unknown node, still get an `{ok, #{}}` reply but leave the status
+%% `disabled`; a request with valid options enables evacuation.
+t_start_evacuation_validation(Config) ->
+    StartPath = ["load_rebalance", atom_to_list(node()), "evacuation", "start"],
+    StatusPath = ["load_rebalance", "status"],
+    RejectedOpts = [#{conn_evict_rate => <<"conn">>},
+                    #{sess_evict_rate => <<"sess">>},
+                    #{redirect_to => 123},
+                    #{wait_takeover => <<"wait">>},
+                    #{migrate_to => []},
+                    #{migrate_to => <<"migrate_to">>},
+                    #{migrate_to => [<<"bad_node">>]},
+                    #{migrate_to => [<<"bad_node">>, atom_to_binary(node())]},
+                    #{unknown => <<"Value">>}
+                   ],
+    _ = [begin
+             ?assertMatch(
+                 {ok, #{}},
+                 api_post(StartPath, Opts)),
+
+             ?assertMatch(
+                 {ok, #{<<"status">> := <<"disabled">>}},
+                 api_get(StatusPath))
+         end || Opts <- RejectedOpts],
+
+    BadNodePath = ["load_rebalance", "bad@node", "evacuation", "start"],
+    ?assertMatch(
+        {ok, #{}},
+        api_post(BadNodePath, #{})),
+
+    ?assertMatch(
+        {ok, #{<<"status">> := <<"disabled">>}},
+        api_get(StatusPath)),
+
+    ValidOpts = #{conn_evict_rate => 10,
+                  sess_evict_rate => 10,
+                  wait_takeover => 10,
+                  redirect_to => <<"srv">>,
+                  migrate_to => [atom_to_binary(?config(recipient_node, Config))]},
+    ?assertMatch(
+        {ok, #{}},
+        api_post(StartPath, ValidOpts)),
+
+    ?assertMatch(
+        {ok, #{<<"status">> := <<"enabled">>}},
+        api_get(StatusPath)).
+
+
+%% Rebalance start requests carrying invalid options, or addressed to an
+%% unknown node, still get an `{ok, #{}}` reply but leave the status
+%% `disabled`; a request with valid options (and 50 connected clients) enables
+%% the rebalance.
+t_start_rebalance_validation(Config) ->
+    StartPath = ["load_rebalance", atom_to_list(node()), "start"],
+    StatusPath = ["load_rebalance", "status"],
+    RejectedOpts = [#{conn_evict_rate => <<"conn">>},
+                    #{sess_evict_rate => <<"sess">>},
+                    #{abs_conn_threshold => <<"act">>},
+                    #{rel_conn_threshold => <<"rct">>},
+                    #{abs_sess_threshold => <<"act">>},
+                    #{rel_sess_threshold => <<"rct">>},
+                    #{wait_takeover => <<"wait">>},
+                    #{wait_health_check => <<"wait">>},
+                    #{nodes => <<"nodes">>},
+                    #{nodes => []},
+                    #{nodes => [<<"bad_node">>]},
+                    #{nodes => [<<"bad_node">>, atom_to_binary(node())]},
+                    #{unknown => <<"Value">>}
+                   ],
+    _ = [begin
+             ?assertMatch(
+                 {ok, #{}},
+                 api_post(StartPath, Opts)),
+
+             ?assertMatch(
+                 {ok, #{<<"status">> := <<"disabled">>}},
+                 api_get(StatusPath))
+         end || Opts <- RejectedOpts],
+
+    BadNodePath = ["load_rebalance", "bad@node", "start"],
+    ?assertMatch(
+        {ok, #{}},
+        api_post(BadNodePath, #{})),
+
+    ?assertMatch(
+        {ok, #{<<"status">> := <<"disabled">>}},
+        api_get(StatusPath)),
+
+    _Conns = emqtt_connect_many(50),
+
+    ValidOpts = #{conn_evict_rate => 10,
+                  sess_evict_rate => 10,
+                  wait_takeover => 10,
+                  wait_health_check => 10,
+                  abs_conn_threshold => 10,
+                  rel_conn_threshold => 1.001,
+                  abs_sess_threshold => 10,
+                  rel_sess_threshold => 1.001,
+                  nodes => [atom_to_binary(?config(recipient_node, Config)),
+                            atom_to_binary(node())]},
+    ?assertMatch(
+        {ok, #{}},
+        api_post(StartPath, ValidOpts)),
+
+    ?assertMatch(
+        {ok, #{<<"status">> := <<"enabled">>}},
+        api_get(StatusPath)).
+
+%% Starts an evacuation of the local node through the HTTP API, checks the
+%% per-node and global status payloads, then stops it and checks that both
+%% report nothing running.
+t_start_stop_evacuation(_Config) ->
+    %% Key of this node in the global status. Derived from node() so the test
+    %% does not depend on the node being named test@127.0.0.1.
+    ThisNode = atom_to_binary(node()),
+
+    ?assertMatch(
+        {ok, #{<<"status">> := <<"disabled">>}},
+        api_get(["load_rebalance", "status"])),
+
+    ?assertMatch(
+        {ok, #{}},
+        api_post(["load_rebalance", atom_to_list(node()), "evacuation", "start"],
+                 #{conn_evict_rate => 10,
+                   sess_evict_rate => 20})),
+
+    ?assertMatch(
+        {ok, #{<<"state">> := _,
+               <<"process">> := <<"evacuation">>,
+               <<"connection_eviction_rate">> := 10,
+               <<"session_eviction_rate">> := 20,
+               <<"connection_goal">> := 0,
+               <<"session_goal">> := 0,
+               <<"stats">> := #{
+                   <<"initial_connected">> := _,
+                   <<"current_connected">> := _,
+                   <<"initial_sessions">> := _,
+                   <<"current_sessions">> := _
+               }}},
+        api_get(["load_rebalance", "status"])),
+
+    ?assertMatch(
+        {ok, #{<<"rebalances">> := #{},
+               <<"evacuations">> :=
+                   #{ThisNode := #{<<"state">> := _,
+                                   <<"connection_eviction_rate">> := 10,
+                                   <<"session_eviction_rate">> := 20,
+                                   <<"connection_goal">> := 0,
+                                   <<"session_goal">> := 0,
+                                   <<"stats">> := #{
+                                       <<"initial_connected">> := _,
+                                       <<"current_connected">> := _,
+                                       <<"initial_sessions">> := _,
+                                       <<"current_sessions">> := _
+                                   }
+                                  }
+                    }}},
+        api_get(["load_rebalance", "global_status"])),
+
+    ?assertMatch(
+        {ok, #{}},
+        api_post(["load_rebalance", atom_to_list(node()), "evacuation", "stop"],
+                 #{})),
+
+    ?assertMatch(
+        {ok, #{<<"status">> := <<"disabled">>}},
+        api_get(["load_rebalance", "status"])),
+
+    ?assertMatch(
+        {ok, #{<<"evacuations">> := #{}, <<"rebalances">> := #{}}},
+        api_get(["load_rebalance", "global_status"])).
+
+%% Starts a rebalance from the local node through the HTTP API, checks the
+%% per-node and global status payloads, then stops it and checks that both
+%% report nothing running.
+t_start_stop_rebalance(Config) ->
+
+    ?assertMatch(
+        {ok, #{<<"status">> := <<"disabled">>}},
+        api_get(["load_rebalance", "status"])),
+
+    _Conns = emqtt_connect_many(100),
+
+    ?assertMatch(
+        {ok, #{}},
+        api_post(["load_rebalance", atom_to_list(node()), "start"],
+                 #{conn_evict_rate => 10,
+                   sess_evict_rate => 20,
+                   abs_conn_threshold => 10})),
+
+    ?assertMatch(
+        {ok, #{<<"state">> := _,
+               <<"process">> := <<"rebalance">>,
+               <<"coordinator_node">> := _,
+               <<"connection_eviction_rate">> := 10,
+               <<"session_eviction_rate">> := 20,
+               <<"stats">> := #{
+                   <<"initial_connected">> := _,
+                   <<"current_connected">> := _,
+                   <<"initial_sessions">> := _,
+                   <<"current_sessions">> := _
+               }}},
+        api_get(["load_rebalance", "status"])),
+
+    DonorNode = atom_to_binary(node()),
+    RecipientNode = atom_to_binary(?config(recipient_node, Config)),
+
+    %% The rebalance was started on the local node, which is also the only
+    %% donor, so its entry is keyed by DonorNode rather than by a hard-coded
+    %% node name.
+    ?assertMatch(
+        {ok, #{<<"evacuations">> := #{},
+               <<"rebalances">> :=
+                   #{DonorNode := #{<<"state">> := _,
+                                    <<"coordinator_node">> := _,
+                                    <<"connection_eviction_rate">> := 10,
+                                    <<"session_eviction_rate">> := 20,
+                                    <<"donors">> := [DonorNode],
+                                    <<"recipients">> := [RecipientNode]
+                                   }
+                    }}},
+        api_get(["load_rebalance", "global_status"])),
+
+    ?assertMatch(
+        {ok, #{}},
+        api_post(["load_rebalance", atom_to_list(node()), "stop"],
+                 #{})),
+
+    ?assertMatch(
+        {ok, #{<<"status">> := <<"disabled">>}},
+        api_get(["load_rebalance", "status"])),
+
+    ?assertMatch(
+        {ok, #{<<"evacuations">> := #{}, <<"rebalances">> := #{}}},
+        api_get(["load_rebalance", "global_status"])).
+
+%% The availability check succeeds while the node is idle and answers with
+%% HTTP 503 while an evacuation is running.
+t_availability_check(_Config) ->
+
+    ?assertMatch(
+        {ok, #{}},
+        api_get(["load_rebalance", "availability_check"])),
+
+    ok = emqx_node_rebalance_evacuation:start(#{}),
+
+    ?assertMatch(
+        {error, {_, 503, _}},
+        api_get(["load_rebalance", "availability_check"])),
+
+    ok = emqx_node_rebalance_evacuation:stop(),
+
+    ?assertMatch(
+        {ok, #{}},
+        api_get(["load_rebalance", "availability_check"])).
+ +api_get(Path) -> + case request_api(get, api_path(Path), auth_header_()) of + {ok, ResponseBody} -> + {ok, jiffy:decode(list_to_binary(ResponseBody), [return_maps])}; + {error, _} = Error -> Error + end. + +api_post(Path, Data) -> + case request_api(post, api_path(Path), [], auth_header_(), Data) of + {ok, ResponseBody} -> + {ok, jiffy:decode(list_to_binary(ResponseBody), [return_maps])}; + {error, _} = Error -> Error + end. diff --git a/apps/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl new file mode 100644 index 000000000..943746298 --- /dev/null +++ b/apps/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl @@ -0,0 +1,199 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_cli_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("eunit/include/eunit.hrl"). +-include_lib("common_test/include/ct.hrl"). + +-import(emqx_eviction_agent_test_helpers, + [emqtt_connect_many/1]). + +all() -> + emqx_ct:all(?MODULE). + +init_per_suite(Config) -> + emqx_ct_helpers:start_apps([emqx_eviction_agent, emqx_node_rebalance]), + Config. 
+
+end_per_suite(Config) ->
+    emqx_ct_helpers:stop_apps([emqx_node_rebalance, emqx_eviction_agent]),
+    Config.
+
+
+%% Every case starts with evacuation and rebalance stopped; t_rebalance
+%% additionally gets an `evacuate1` slave node in Config.
+init_per_testcase(t_rebalance, Config) ->
+    _ = emqx_node_rebalance_evacuation:stop(),
+    %% Also stop a leftover rebalance, as the generic clause does, so that
+    %% t_rebalance never starts against a coordinator that is still enabled.
+    _ = emqx_node_rebalance:stop(),
+    Node = emqx_node_helpers:start_slave(
+             evacuate1,
+             #{start_apps => [emqx, emqx_eviction_agent, emqx_node_rebalance]}),
+    [{evacuate_node, Node} | Config];
+init_per_testcase(_Case, Config) ->
+    _ = emqx_node_rebalance_evacuation:stop(),
+    _ = emqx_node_rebalance:stop(),
+    Config.
+
+end_per_testcase(t_rebalance, Config) ->
+    _ = emqx_node_rebalance_evacuation:stop(),
+    _ = emqx_node_rebalance:stop(),
+    _ = emqx_node_helpers:stop_slave(?config(evacuate_node, Config));
+end_per_testcase(_Case, _Config) ->
+    _ = emqx_node_rebalance_evacuation:stop(),
+    _ = emqx_node_rebalance:stop().
+
+%% Drives the evacuation commands of the CLI: usage, status, rejected
+%% arguments, a successful start, a rejected second start, and stop.
+t_evacuation(_Config) ->
+    %% usage
+    ok = emqx_node_rebalance_cli:cli(["foobar"]),
+
+    %% status
+    ok = emqx_node_rebalance_cli:cli(["status"]),
+    ok = emqx_node_rebalance_cli:cli(["node-status"]),
+    ok = emqx_node_rebalance_cli:cli(["node-status", atom_to_list(node())]),
+
+    %% start with invalid args
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start", "--evacuation", "--foo-bar"])),
+
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start", "--evacuation", "--conn-evict-rate", "foobar"])),
+
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start", "--evacuation", "--sess-evict-rate", "foobar"])),
+
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start", "--evacuation", "--wait-takeover", "foobar"])),
+
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start", "--evacuation",
+                                     "--migrate-to", "nonexistent@node"])),
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start", "--evacuation",
+                                     "--migrate-to", ""])),
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start", "--evacuation",
+                                     "--unknown-arg"])),
+    ?assert(
+        emqx_node_rebalance_cli:cli(["start", "--evacuation",
+                                     "--conn-evict-rate", "10",
+                                     "--sess-evict-rate", "10",
+                                     "--wait-takeover", "10",
+                                     "--migrate-to", atom_to_list(node()),
+                                     "--redirect-to", "srv"])),
+
+    %% status
+    ok = emqx_node_rebalance_cli:cli(["status"]),
+    ok = emqx_node_rebalance_cli:cli(["node-status"]),
+    ok = emqx_node_rebalance_cli:cli(["node-status", atom_to_list(node())]),
+
+    ?assertMatch(
+        {enabled, #{}},
+        emqx_node_rebalance_evacuation:status()),
+
+    %% already enabled
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start", "--evacuation",
+                                     "--conn-evict-rate", "10",
+                                     "--redirect-to", "srv"])),
+
+    %% stop
+    true = emqx_node_rebalance_cli:cli(["stop"]),
+
+    false = emqx_node_rebalance_cli:cli(["stop"]),
+
+    ?assertEqual(
+        disabled,
+        emqx_node_rebalance_evacuation:status()).
+
+%% Drives the rebalance commands of the CLI: rejected arguments, a successful
+%% start with 20 connected clients, status, a rejected second start, and stop.
+t_rebalance(Config) ->
+    %% start with invalid args
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start", "--foo-bar"])),
+
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start", "--conn-evict-rate", "foobar"])),
+
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start", "--abs-conn-threshold", "foobar"])),
+
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start", "--rel-conn-threshold", "foobar"])),
+
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start", "--sess-evict-rate", "foobar"])),
+
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start", "--abs-sess-threshold", "foobar"])),
+
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start", "--rel-sess-threshold", "foobar"])),
+
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start", "--wait-takeover", "foobar"])),
+
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start", "--wait-health-check", "foobar"])),
+
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start",
+                                     "--nodes", "nonexistent@node"])),
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start",
+                                     "--nodes", ""])),
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start",
+                                     "--nodes", atom_to_list(?config(evacuate_node, Config))])),
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start",
+                                     "--unknown-arg"])),
+
+    _ = emqtt_connect_many(20),
+
+    ?assert(
+        emqx_node_rebalance_cli:cli(["start",
+                                     "--conn-evict-rate", "10",
+                                     "--abs-conn-threshold", "10",
+                                     "--rel-conn-threshold", "1.1",
+                                     "--sess-evict-rate", "10",
+                                     "--abs-sess-threshold", "10",
+                                     "--rel-sess-threshold", "1.1",
+                                     "--wait-takeover", "10",
+                                     "--nodes", atom_to_list(node()) ++ ","
+                                                ++ atom_to_list(?config(evacuate_node, Config))
+                                    ])),
+
+    %% status
+    ok = emqx_node_rebalance_cli:cli(["status"]),
+    ok = emqx_node_rebalance_cli:cli(["node-status"]),
+    ok = emqx_node_rebalance_cli:cli(["node-status", atom_to_list(node())]),
+
+    ?assertMatch(
+        {enabled, #{}},
+        emqx_node_rebalance:status()),
+
+    %% already enabled
+    ?assertNot(
+        emqx_node_rebalance_cli:cli(["start"])),
+
+    %% stop
+    true = emqx_node_rebalance_cli:cli(["stop"]),
+
+    false = emqx_node_rebalance_cli:cli(["stop"]),
+
+    ?assertEqual(
+        disabled,
+        emqx_node_rebalance:status()).
diff --git a/apps/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl
new file mode 100644
index 000000000..23f35f61e
--- /dev/null
+++ b/apps/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl
@@ -0,0 +1,194 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_evacuation_SUITE).
+
+-compile(export_all).
+-compile(nowarn_export_all).
+
+-include_lib("emqx/include/emqx_mqtt.hrl").
+-include_lib("eunit/include/eunit.hrl").
+-include_lib("common_test/include/ct.hrl").
+-include_lib("snabbkaffe/include/snabbkaffe.hrl").
+
+-import(emqx_eviction_agent_test_helpers,
+        [emqtt_connect/0, emqtt_connect/2, emqtt_try_connect/0]).
+
+all() ->
+    emqx_ct:all(?MODULE).
+
+init_per_suite(Config) ->
+    emqx_ct_helpers:start_apps([emqx_eviction_agent, emqx_node_rebalance]),
+    Config.
+
+end_per_suite(Config) ->
+    emqx_ct_helpers:stop_apps([emqx_node_rebalance, emqx_eviction_agent]),
+    Config.
+
+%% Every case starts with evacuation stopped and a fresh `evacuate1` slave node
+%% stored in Config as `evacuate_node`.
+init_per_testcase(_Case, Config) ->
+    _ = emqx_node_rebalance_evacuation:stop(),
+    Node = emqx_node_helpers:start_slave(
+             evacuate1,
+             #{start_apps => [emqx, emqx_eviction_agent]}),
+    [{evacuate_node, Node} | Config].
+
+end_per_testcase(_Case, Config) ->
+    _ = emqx_node_helpers:stop_slave(?config(evacuate_node, Config)),
+    _ = emqx_node_rebalance_evacuation:stop().
+
+%% Evacuation refuses to start while the eviction agent is held by another
+%% kind.
+t_agent_busy(Config) ->
+    ok = emqx_eviction_agent:enable(other_rebalance, undefined),
+
+    %% end_per_testcase/2 does not release the eviction agent, so release it
+    %% here even when the assertion fails, and check that the release worked.
+    try
+        ?assertEqual(
+            {error, eviction_agent_busy},
+            emqx_node_rebalance_evacuation:start(opts(Config)))
+    after
+        ok = emqx_eviction_agent:disable(other_rebalance)
+    end.
+
+
+%% A second start while evacuation is running is rejected.
+t_already_started(Config) ->
+    ok = emqx_node_rebalance_evacuation:start(opts(Config)),
+
+    ?assertEqual(
+        {error, already_started},
+        emqx_node_rebalance_evacuation:start(opts(Config))),
+
+    ok = emqx_node_rebalance_evacuation:stop().
+
+%% Stopping when nothing is running is rejected.
+t_not_started(_Config) ->
+    ?assertEqual(
+        {error, not_started},
+        emqx_node_rebalance_evacuation:stop()).
+
+%% While evacuation is running, new connections are refused with
+%% `use_another_server`.
+t_start(Config) ->
+    process_flag(trap_exit, true),
+
+    ok = emqx_node_rebalance_evacuation:start(opts(Config)),
+    ?assertMatch(
+        {error, {use_another_server, #{}}},
+        emqtt_try_connect()),
+    ok = emqx_node_rebalance_evacuation:stop().
+
+%% Evacuation survives a restart of its process: after the supervisor child is
+%% terminated and restarted, connections are still refused and the status still
+%% carries the options it was started with.
+t_persistence(Config) ->
+    process_flag(trap_exit, true),
+
+    ok = emqx_node_rebalance_evacuation:start(opts(Config)),
+
+    ?assertMatch(
+        {error, {use_another_server, #{}}},
+        emqtt_try_connect()),
+
+    ok = supervisor:terminate_child(emqx_node_rebalance_sup, emqx_node_rebalance_evacuation),
+    {ok, _} = supervisor:restart_child(emqx_node_rebalance_sup, emqx_node_rebalance_evacuation),
+
+    ?assertMatch(
+        {error, {use_another_server, #{}}},
+        emqtt_try_connect()),
+    ?assertMatch(
+        {enabled, #{conn_evict_rate := 10}},
+        emqx_node_rebalance_evacuation:status()),
+
+    ok = emqx_node_rebalance_evacuation:stop().
+
+%% A client connected before evacuation starts gets evicted, and new
+%% connections are refused.
+t_conn_evicted(Config) ->
+    process_flag(trap_exit, true),
+
+    {ok, C} = emqtt_connect(),
+    %% Monitor the client before evacuation starts, so its termination is
+    %% observed directly instead of sleeping and polling is_process_alive/1.
+    MRef = erlang:monitor(process, C),
+
+    ?check_trace(
+        ?wait_async_action(
+            emqx_node_rebalance_evacuation:start(opts(Config)),
+            #{?snk_kind := node_evacuation_evict_conn},
+            1000),
+        fun(_Result, _Trace) -> ok end),
+
+    ?assertMatch(
+        {error, {use_another_server, #{}}},
+        emqtt_try_connect()),
+
+    receive
+        {'DOWN', MRef, process, C, _Reason} -> ok
+    after 1000 ->
+        ?assert(false, "Connection not evicted")
+    end.
+
+%% migrate_to/1 returns only nodes that are known and whose eviction agent is
+%% not enabled.
+t_migrate_to(Config) ->
+    ?assertEqual(
+        [?config(evacuate_node, Config)],
+        emqx_node_rebalance_evacuation:migrate_to(undefined)),
+
+    ?assertEqual(
+        [],
+        emqx_node_rebalance_evacuation:migrate_to(['unknown@node'])),
+
+    %% Assert the remote call succeeded so that an rpc failure is reported as
+    %% such instead of as a mismatch in the node list below.
+    ok = rpc:call(?config(evacuate_node, Config),
+                  emqx_eviction_agent,
+                  enable,
+                  [test_rebalance, undefined]),
+
+    ?assertEqual(
+        [],
+        emqx_node_rebalance_evacuation:migrate_to(undefined)).
+ +t_session_evicted(Config) -> + process_flag(trap_exit, true), + + {ok, C} = emqtt_connect(<<"client_with_sess">>, false), + + ?check_trace( + ?wait_async_action( + emqx_node_rebalance_evacuation:start(opts(Config)), + #{?snk_kind := node_evacuation_evict_sess_over}, + 5000), + fun(_Result, Trace) -> + ?assertMatch( + [_ | _], + ?of_kind(node_evacuation_evict_sess_over, Trace)) + end), + + receive + {'EXIT', C, {disconnected, ?RC_USE_ANOTHER_SERVER, _}} -> ok + after 1000 -> + ?assert(false, "Connection not evicted") + end, + + [ChannelPid] = emqx_cm_registry:lookup_channels(<<"client_with_sess">>), + + ?assertEqual( + ?config(evacuate_node, Config), + node(ChannelPid)). + +t_unknown_messages(Config) -> + ok = emqx_node_rebalance_evacuation:start(opts(Config)), + + whereis(emqx_node_rebalance_evacuation) ! unknown, + + gen_server:cast(emqx_node_rebalance_evacuation, unknown), + + ?assertEqual( + ignored, + gen_server:call(emqx_node_rebalance_evacuation, unknown)), + + ok = emqx_node_rebalance_evacuation:stop(). + +opts(Config) -> + #{ + server_reference => <<"srv">>, + conn_evict_rate => 10, + sess_evict_rate => 10, + wait_takeover => 1, + migrate_to => [?config(evacuate_node, Config)] + }. diff --git a/apps/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_persist_SUITE.erl b/apps/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_persist_SUITE.erl new file mode 100644 index 000000000..ba1a12775 --- /dev/null +++ b/apps/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_persist_SUITE.erl @@ -0,0 +1,107 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. 
+%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_evacuation_persist_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("eunit/include/eunit.hrl"). +-include_lib("common_test/include/ct.hrl"). + +all() -> + emqx_ct:all(?MODULE). + +init_per_suite(Config) -> + emqx_ct_helpers:start_apps([emqx_eviction_agent, emqx_node_rebalance]), + Config. + +end_per_suite(Config) -> + emqx_ct_helpers:stop_apps([emqx_node_rebalance, emqx_eviction_agent]), + Config. + +init_per_testcase(_Case, Config) -> + _ = emqx_node_rebalance_evacuation_persist:clear(), + Config. + +end_per_testcase(_Case, _Config) -> + _ = emqx_node_rebalance_evacuation_persist:clear(). + +t_save_read(_Config) -> + DefaultOpts = #{server_reference => <<"default_ref">>, + conn_evict_rate => 2001, + sess_evict_rate => 2002, + wait_takeover => 2003 + }, + + Opts0 = #{server_reference => <<"ref">>, + conn_evict_rate => 1001, + sess_evict_rate => 1002, + wait_takeover => 1003 + }, + ok = emqx_node_rebalance_evacuation_persist:save(Opts0), + + {ok, ReadOpts0} = emqx_node_rebalance_evacuation_persist:read(DefaultOpts), + ?assertEqual(Opts0, ReadOpts0), + + Opts1 = Opts0#{server_reference => undefined}, + ok = emqx_node_rebalance_evacuation_persist:save(Opts1), + + {ok, ReadOpts1} = emqx_node_rebalance_evacuation_persist:read(DefaultOpts), + ?assertEqual(Opts1, ReadOpts1). 
+ +t_read_default(_Config) -> + ok = write_evacuation_file(<<"{}">>), + + DefaultOpts = #{server_reference => <<"ref">>, + conn_evict_rate => 1001, + sess_evict_rate => 1002, + wait_takeover => 1003 + }, + + {ok, ReadOpts} = emqx_node_rebalance_evacuation_persist:read(DefaultOpts), + ?assertEqual(DefaultOpts, ReadOpts). + +t_read_bad_data(_Config) -> + ok = write_evacuation_file(<<"{bad json">>), + + DefaultOpts = #{server_reference => <<"ref">>, + conn_evict_rate => 1001, + sess_evict_rate => 1002, + wait_takeover => 1003 + }, + + {ok, ReadOpts} = emqx_node_rebalance_evacuation_persist:read(DefaultOpts), + ?assertEqual(DefaultOpts, ReadOpts). + +t_clear(_Config) -> + ok = write_evacuation_file(<<"{}">>), + + ?assertMatch( + {ok, _}, + emqx_node_rebalance_evacuation_persist:read(#{})), + + ok = emqx_node_rebalance_evacuation_persist:clear(), + + ?assertEqual( + none, + emqx_node_rebalance_evacuation_persist:read(#{})). + +write_evacuation_file(Json) -> + ok = filelib:ensure_dir(emqx_node_rebalance_evacuation_persist:evacuation_filepath()), + ok = file:write_file( + emqx_node_rebalance_evacuation_persist:evacuation_filepath(), + Json). diff --git a/apps/emqx_rule_engine/src/emqx_rule_engine.app.src b/apps/emqx_rule_engine/src/emqx_rule_engine.app.src index 259efa08d..90620e2df 100644 --- a/apps/emqx_rule_engine/src/emqx_rule_engine.app.src +++ b/apps/emqx_rule_engine/src/emqx_rule_engine.app.src @@ -1,6 +1,6 @@ {application, emqx_rule_engine, [{description, "EMQ X Rule Engine"}, - {vsn, "4.3.13"}, % strict semver, bump manually! + {vsn, "4.3.14"}, % strict semver, bump manually! 
{modules, []}, {registered, [emqx_rule_engine_sup, emqx_rule_registry]}, {applications, [kernel,stdlib,rulesql,getopt]}, diff --git a/apps/emqx_rule_engine/src/emqx_rule_engine.appup.src b/apps/emqx_rule_engine/src/emqx_rule_engine.appup.src index ba39c4ec9..1c5fb0e7b 100644 --- a/apps/emqx_rule_engine/src/emqx_rule_engine.appup.src +++ b/apps/emqx_rule_engine/src/emqx_rule_engine.appup.src @@ -1,7 +1,8 @@ %% -*- mode: erlang -*- %% Unless you know what you are doing, DO NOT edit manually!! {VSN, - [{"4.3.12",[{load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}]}, + [{"4.3.13",[{load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}]}, + {"4.3.12",[{load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}]}, {"4.3.11", [{load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_validator,brutal_purge,soft_purge,[]}, @@ -165,7 +166,8 @@ {load_module,emqx_rule_runtime,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_engine_api,brutal_purge,soft_purge,[]}]}, {<<".*">>,[]}], - [{"4.3.12",[{load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}]}, + [{"4.3.13",[{load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}]}, + {"4.3.12",[{load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}]}, {"4.3.11", [{load_module,emqx_rule_registry,brutal_purge,soft_purge,[]}, {load_module,emqx_rule_validator,brutal_purge,soft_purge,[]}, diff --git a/data/loaded_plugins.tmpl b/data/loaded_plugins.tmpl index d0dac7fe1..62b3205cd 100644 --- a/data/loaded_plugins.tmpl +++ b/data/loaded_plugins.tmpl @@ -6,3 +6,5 @@ {emqx_telemetry, {{enable_plugin_emqx_telemetry}}}. {emqx_rule_engine, {{enable_plugin_emqx_rule_engine}}}. {emqx_bridge_mqtt, {{enable_plugin_emqx_bridge_mqtt}}}. +{emqx_eviction_agent, true}. +{emqx_node_rebalance, true}. 
diff --git a/lib-ce/emqx_dashboard/src/emqx_dashboard.app.src b/lib-ce/emqx_dashboard/src/emqx_dashboard.app.src index 1281145f0..3c9a4e3fa 100644 --- a/lib-ce/emqx_dashboard/src/emqx_dashboard.app.src +++ b/lib-ce/emqx_dashboard/src/emqx_dashboard.app.src @@ -1,6 +1,6 @@ {application, emqx_dashboard, [{description, "EMQ X Web Dashboard"}, - {vsn, "4.3.14"}, % strict semver, bump manually! + {vsn, "4.3.15"}, % strict semver, bump manually! {modules, []}, {registered, [emqx_dashboard_sup]}, {applications, [kernel,stdlib,mnesia,minirest]}, diff --git a/rebar.config.erl b/rebar.config.erl index 78e1ef94e..04ffaefbd 100644 --- a/rebar.config.erl +++ b/rebar.config.erl @@ -51,6 +51,8 @@ overrides() -> [ {add, [ {extra_src_dirs, [{"etc", [{recursive,true}]}]} , {erl_opts, [{compile_info, [{emqx_vsn, get_vsn()}]}]} ]} + + , {add, relx, [{erl_opts, [{d, 'RLX_LOG', rlx_log}]}]} , {add, snabbkaffe, [{erl_opts, common_compile_opts()}]} ] ++ community_plugin_overrides(). @@ -295,6 +297,8 @@ relx_plugin_apps(ReleaseType) -> , emqx_recon , emqx_rule_engine , emqx_sasl + , emqx_eviction_agent + , emqx_node_rebalance ] ++ [emqx_telemetry || not is_enterprise()] ++ relx_plugin_apps_per_rel(ReleaseType) diff --git a/src/emqx.app.src b/src/emqx.app.src index 14a6f4fa4..14c63c75e 100644 --- a/src/emqx.app.src +++ b/src/emqx.app.src @@ -6,7 +6,7 @@ %% the emqx `release' version, which in turn is comprised of several %% apps, one of which is this. See `emqx_release.hrl' for more %% info. - {vsn, "4.3.19"}, % strict semver, bump manually! + {vsn, "4.3.20"}, % strict semver, bump manually! {modules, []}, {registered, []}, {applications, [ kernel diff --git a/src/emqx.appup.src b/src/emqx.appup.src index 70d18ff48..b2be51df5 100644 --- a/src/emqx.appup.src +++ b/src/emqx.appup.src @@ -1,12 +1,19 @@ %% -*- mode: erlang -*- %% Unless you know what you are doing, DO NOT edit manually!! 
{VSN, - [ + [{"4.3.19", + [{load_module,emqx_plugins,brutal_purge,soft_purge,[]}, + {load_module,emqx_channel,brutal_purge,soft_purge,[]}, + {load_module,emqx_cm,brutal_purge,soft_purge,[]}]}, {"4.3.18", - [{load_module,emqx_app,brutal_purge,soft_purge,[]}, + [{load_module,emqx_channel,brutal_purge,soft_purge,[]}, + {load_module,emqx_cm,brutal_purge,soft_purge,[]}, + {load_module,emqx_app,brutal_purge,soft_purge,[]}, {load_module,emqx_plugins,brutal_purge,soft_purge,[]}]}, {"4.3.17", - [{load_module,emqx_exclusive_subscription,brutal_purge,soft_purge,[]}, + [{load_module,emqx_channel,brutal_purge,soft_purge,[]}, + {load_module,emqx_cm,brutal_purge,soft_purge,[]}, + {load_module,emqx_exclusive_subscription,brutal_purge,soft_purge,[]}, {load_module,emqx_session,brutal_purge,soft_purge,[]}, {load_module,emqx_shared_sub,brutal_purge,soft_purge,[]}, {update,emqx_broker_sup,supervisor}, @@ -14,7 +21,8 @@ {load_module,emqx_plugins,brutal_purge,soft_purge,[]}, {load_module,emqx_access_control,brutal_purge,soft_purge,[]}]}, {"4.3.16", - [{load_module,emqx_session,brutal_purge,soft_purge,[]}, + [{load_module,emqx_cm,brutal_purge,soft_purge,[]}, + {load_module,emqx_session,brutal_purge,soft_purge,[]}, {load_module,emqx_shared_sub,brutal_purge,soft_purge,[]}, {update,emqx_broker_sup,supervisor}, {load_module,emqx_access_control,brutal_purge,soft_purge,[]}, @@ -30,7 +38,8 @@ {load_module,emqx_mqtt_caps,brutal_purge,soft_purge,[]}, {load_module,emqx_topic,brutal_purge,soft_purge,[]}]}, {"4.3.15", - [{add_module,emqx_calendar}, + [{load_module,emqx_cm,brutal_purge,soft_purge,[]}, + {add_module,emqx_calendar}, {load_module,emqx_topic,brutal_purge,soft_purge,[]}, {add_module,emqx_exclusive_subscription}, {apply,{emqx_exclusive_subscription,on_add_module,[]}}, @@ -55,7 +64,8 @@ {update,emqx_os_mon,{advanced,[]}}, {load_module,emqx_app,brutal_purge,soft_purge,[]}]}, {"4.3.14", - [{add_module,emqx_calendar}, + [{load_module,emqx_cm,brutal_purge,soft_purge,[]}, + 
{add_module,emqx_calendar}, {load_module,emqx_topic,brutal_purge,soft_purge,[]}, {add_module,emqx_exclusive_subscription}, {apply,{emqx_exclusive_subscription,on_add_module,[]}}, @@ -682,12 +692,19 @@ {load_module,emqx_message,brutal_purge,soft_purge,[]}, {load_module,emqx_limiter,brutal_purge,soft_purge,[]}]}, {<<".*">>,[]}], - [ + [{"4.3.19", + [{load_module,emqx_plugins,brutal_purge,soft_purge,[]}, + {load_module,emqx_channel,brutal_purge,soft_purge,[]}, + {load_module,emqx_cm,brutal_purge,soft_purge,[]}]}, {"4.3.18", - [{load_module,emqx_app,brutal_purge,soft_purge,[]}, - {load_module,emqx_plugins,brutal_purge,soft_purge,[]}]}, + [{load_module,emqx_channel,brutal_purge,soft_purge,[]}, + {load_module,emqx_cm,brutal_purge,soft_purge,[]}, + {load_module,emqx_app,brutal_purge,soft_purge,[]}, + {load_module,emqx_plugins,brutal_purge,soft_purge,[]}]}, {"4.3.17", - [{load_module,emqx_exclusive_subscription,brutal_purge,soft_purge,[]}, + [{load_module,emqx_channel,brutal_purge,soft_purge,[]}, + {load_module,emqx_cm,brutal_purge,soft_purge,[]}, + {load_module,emqx_exclusive_subscription,brutal_purge,soft_purge,[]}, {load_module,emqx_session,brutal_purge,soft_purge,[]}, {load_module,emqx_shared_sub,brutal_purge,soft_purge,[]}, {update,emqx_broker_sup,supervisor}, @@ -695,7 +712,8 @@ {load_module,emqx_plugins,brutal_purge,soft_purge,[]}, {load_module,emqx_access_control,brutal_purge,soft_purge,[]}]}, {"4.3.16", - [{load_module,emqx_session,brutal_purge,soft_purge,[]}, + [{load_module,emqx_cm,brutal_purge,soft_purge,[]}, + {load_module,emqx_session,brutal_purge,soft_purge,[]}, {load_module,emqx_shared_sub,brutal_purge,soft_purge,[]}, {update,emqx_broker_sup,supervisor}, {load_module,emqx_access_control,brutal_purge,soft_purge,[]}, @@ -711,7 +729,8 @@ {apply,{emqx_exclusive_subscription,on_delete_module,[]}}, {delete_module,emqx_exclusive_subscription}]}, {"4.3.15", - [{delete_module,emqx_calendar}, + [{load_module,emqx_cm,brutal_purge,soft_purge,[]}, + 
{delete_module,emqx_calendar}, {apply,{emqx_exclusive_subscription,on_delete_module,[]}}, {delete_module,emqx_exclusive_subscription}, {load_module,emqx_topic,brutal_purge,soft_purge,[]}, @@ -735,7 +754,8 @@ {load_module,emqx_os_mon,brutal_purge,soft_purge,[]}, {load_module,emqx_app,brutal_purge,soft_purge,[]}]}, {"4.3.14", - [{delete_module,emqx_calendar}, + [{load_module,emqx_cm,brutal_purge,soft_purge,[]}, + {delete_module,emqx_calendar}, {apply,{emqx_exclusive_subscription,on_delete_module,[]}}, {delete_module,emqx_exclusive_subscription}, {load_module,emqx_topic,brutal_purge,soft_purge,[]}, diff --git a/src/emqx_channel.erl b/src/emqx_channel.erl index bf292f10b..852c171fe 100644 --- a/src/emqx_channel.erl +++ b/src/emqx_channel.erl @@ -22,6 +22,8 @@ -include("logger.hrl"). -include("types.hrl"). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). + -logger_header("[Channel]"). -ifdef(TEST). @@ -850,11 +852,14 @@ handle_out(disconnect, ReasonCode, Channel) when is_integer(ReasonCode) -> ReasonName = disconnect_reason(ReasonCode), handle_out(disconnect, {ReasonCode, ReasonName}, Channel); -handle_out(disconnect, {ReasonCode, ReasonName}, Channel = ?IS_MQTT_V5) -> - Packet = ?DISCONNECT_PACKET(ReasonCode), +handle_out(disconnect, {ReasonCode, ReasonName}, Channel) -> + handle_out(disconnect, {ReasonCode, ReasonName, #{}}, Channel); + +handle_out(disconnect, {ReasonCode, ReasonName, Props}, Channel = ?IS_MQTT_V5) -> + Packet = ?DISCONNECT_PACKET(ReasonCode, Props), {ok, [{outgoing, Packet}, {close, ReasonName}], Channel}; -handle_out(disconnect, {_ReasonCode, ReasonName}, Channel) -> +handle_out(disconnect, {_ReasonCode, ReasonName, _Props}, Channel) -> {ok, {close, ReasonName}, Channel}; handle_out(auth, {ReasonCode, Properties}, Channel) -> @@ -954,11 +959,15 @@ handle_call({takeover, 'begin'}, Channel = #channel{session = Session}) -> reply(Session, Channel#channel{takeover = true}); handle_call({takeover, 'end'}, Channel = #channel{session = Session, - 
pendings = Pendings}) -> + pendings = Pendings, + conninfo = #{clientid := ClientId}}) -> ok = emqx_session:takeover(Session), %% TODO: Should not drain deliver here (side effect) Delivers = emqx_misc:drain_deliver(), AllPendings = lists:append(Delivers, Pendings), + ?tp(debug, + emqx_channel_takeover_end, + #{clientid => ClientId}), disconnect_and_shutdown(takeovered, AllPendings, Channel); handle_call(list_acl_cache, Channel) -> @@ -1019,6 +1028,9 @@ handle_info(clean_acl_cache, Channel) -> ok = emqx_acl_cache:empty_acl_cache(), {ok, Channel}; +handle_info({disconnect, ReasonCode, ReasonName, Props}, Channel) -> + handle_out(disconnect, {ReasonCode, ReasonName, Props}, Channel); + handle_info(Info, Channel) -> ?LOG(error, "Unexpected info: ~p", [Info]), {ok, Channel}. diff --git a/src/emqx_cm.erl b/src/emqx_cm.erl index 09a2ac9a0..09169b160 100644 --- a/src/emqx_cm.erl +++ b/src/emqx_cm.erl @@ -22,6 +22,8 @@ -include("emqx.hrl"). -include("logger.hrl"). -include("types.hrl"). +-include_lib("stdlib/include/qlc.hrl"). +-include_lib("stdlib/include/ms_transform.hrl"). -include_lib("snabbkaffe/include/snabbkaffe.hrl"). -logger_header("[CM]"). @@ -60,7 +62,9 @@ , lookup_channels/2 ]). --export([all_channels/0]). +-export([all_channels/0, + channel_with_session_table/0, + live_connection_table/0]). %% gen_server callbacks -export([ init/1 @@ -149,8 +153,11 @@ connection_closed(ClientId) -> connection_closed(ClientId, self()). -spec(connection_closed(emqx_types:clientid(), chan_pid()) -> true). -connection_closed(ClientId, ChanPid) -> - ets:delete_object(?CHAN_CONN_TAB, {ClientId, ChanPid}). +connection_closed(_ClientId, _ChanPid) -> + %% We can't clean CHAN_CONN_TAB because records for dead connections + %% are required for `get_chann_conn_mod/1` function, and `get_chann_conn_mod/1` + %% is used for takeover. + true. %% @doc Get info of a channel. -spec(get_chan_info(emqx_types:clientid()) -> maybe(emqx_types:infos())). 
@@ -425,6 +432,38 @@ all_channels() -> Pat = [{{'_', '$1'}, [], ['$1']}], ets:select(?CHAN_TAB, Pat). +%% @doc Get clientinfo for all clients with sessions +channel_with_session_table() -> + Ms = ets:fun2ms( + fun({{ClientId, _ChanPid}, + Info, + _Stats}) -> + {ClientId, Info} + end), + Table = ets:table(?CHAN_INFO_TAB, [{traverse, {select, Ms}}]), + qlc:q([ {ClientId, ConnState, ConnInfo, ClientInfo} + || {ClientId, + #{conn_state := ConnState, + clientinfo := ClientInfo, + conninfo := #{clean_start := false} = ConnInfo}} <- Table + ]). + +%% @doc Get all local connection query handle +live_connection_table() -> + Ms = ets:fun2ms( + fun({{ClientId, ChanPid}, _}) -> + {ClientId, ChanPid} + end), + Table = ets:table(?CHAN_CONN_TAB, [{traverse, {select, Ms}}]), + qlc:q([{ClientId, ChanPid} || {ClientId, ChanPid} <- Table, is_channel_connected(ClientId, ChanPid)]). + +is_channel_connected(ClientId, ChanPid) when node(ChanPid) =:= node() -> + case get_chan_info(ClientId, ChanPid) of + #{conn_state := disconnected} -> false; + _ -> true + end; +is_channel_connected(_ClientId, _ChanPid) -> false. + %% @doc Lookup channels. -spec(lookup_channels(emqx_types:clientid()) -> list(chan_pid())). lookup_channels(ClientId) -> @@ -523,4 +562,3 @@ get_chann_conn_mod(ClientId, ChanPid) when node(ChanPid) == node() -> end; get_chann_conn_mod(ClientId, ChanPid) -> rpc_call(node(ChanPid), get_chann_conn_mod, [ClientId, ChanPid], ?T_GET_INFO). 
- diff --git a/src/emqx_plugins.erl b/src/emqx_plugins.erl index 764778b74..4088988c2 100644 --- a/src/emqx_plugins.erl +++ b/src/emqx_plugins.erl @@ -225,6 +225,8 @@ ensure_file(File) -> , {emqx_telemetry, true} , {emqx_rule_engine, true} , {emqx_bridge_mqtt, false} + , {emqx_eviction_agent, true} + , {emqx_node_rebalance, true} ], write_loaded(DefaultPlugins); true -> diff --git a/test/emqx_node_helpers.erl b/test/emqx_node_helpers.erl new file mode 100644 index 000000000..16d4be000 --- /dev/null +++ b/test/emqx_node_helpers.erl @@ -0,0 +1,83 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-module(emqx_node_helpers). + +-include_lib("eunit/include/eunit.hrl"). + +-define(SLAVE_START_APPS, [emqx]). + +-export([start_slave/1, + start_slave/2, + stop_slave/1]). + +start_slave(Name) -> + start_slave(Name, #{}). + +start_slave(Name, Opts) -> + {ok, Node} = ct_slave:start(list_to_atom(atom_to_list(Name) ++ "@" ++ host()), + [{kill_if_fail, true}, + {monitor_master, true}, + {init_timeout, 10000}, + {startup_timeout, 10000}, + {erl_flags, ebin_path()}]), + + pong = net_adm:ping(Node), + setup_node(Node, Opts), + Node. + +stop_slave(Node) -> + rpc:call(Node, ekka, leave, []), + ct_slave:stop(Node). 
+ +host() -> + [_, Host] = string:tokens(atom_to_list(node()), "@"), Host. + +ebin_path() -> + string:join(["-pa" | lists:filter(fun is_lib/1, code:get_path())], " "). + +is_lib(Path) -> + string:prefix(Path, code:lib_dir()) =:= nomatch. + +setup_node(Node, #{} = Opts) -> + Listeners = maps:get(listeners, Opts, []), + StartApps = maps:get(start_apps, Opts, ?SLAVE_START_APPS), + DefaultEnvHandler = + fun(emqx) -> + application:set_env( + emqx, + listeners, + Listeners), + application:set_env(gen_rpc, port_discovery, stateless), + ok; + (_) -> + ok + end, + EnvHandler = maps:get(env_handler, Opts, DefaultEnvHandler), + + [ok = rpc:call(Node, application, load, [App]) || App <- [gen_rpc, emqx]], + ok = rpc:call(Node, emqx_ct_helpers, start_apps, [StartApps, EnvHandler]), + + rpc:call(Node, ekka, join, [node()]), + + %% Sanity check. Assert that `gen_rpc' is set up correctly: + ?assertEqual( Node + , gen_rpc:call(Node, erlang, node, []) + ), + ?assertEqual( node() + , gen_rpc:call(Node, gen_rpc, call, [node(), erlang, node, []]) + ), + ok. 
diff --git a/test/emqx_plugins_SUITE.erl b/test/emqx_plugins_SUITE.erl index 66a88a047..b95030889 100644 --- a/test/emqx_plugins_SUITE.erl +++ b/test/emqx_plugins_SUITE.erl @@ -99,8 +99,10 @@ t_ensure_default_loaded_plugins_file(Config) -> ?assertEqual( [ {emqx_bridge_mqtt, false} , {emqx_dashboard, true} + , {emqx_eviction_agent, true} , {emqx_management, true} , {emqx_modules, true} + , {emqx_node_rebalance, true} , {emqx_recon, true} , {emqx_retainer, true} , {emqx_rule_engine, true} diff --git a/test/emqx_shared_sub_SUITE.erl b/test/emqx_shared_sub_SUITE.erl index 02c4c6598..0eb63dbf0 100644 --- a/test/emqx_shared_sub_SUITE.erl +++ b/test/emqx_shared_sub_SUITE.erl @@ -380,7 +380,7 @@ t_local(_) -> emqtt:stop(ConnPid1), emqtt:stop(ConnPid2), - stop_slave(Node), + emqx_node_helpers:stop_slave(Node), ?assertEqual(local, emqx_shared_sub:strategy(<<"local_group">>)), ?assertEqual(local, RemoteLocalGroupStrategy), @@ -415,7 +415,7 @@ t_local_fallback(_) -> {true, UsedSubPid2} = last_message(<<"hello2">>, [ConnPid1]), emqtt:stop(ConnPid1), - stop_slave(Node), + emqx_node_helpers:stop_slave(Node), ?assertEqual(UsedSubPid1, UsedSubPid2), ok. @@ -536,55 +536,8 @@ recv_msgs(Count, Msgs) -> end. start_slave(Name, Port) -> - {ok, Node} = ct_slave:start(list_to_atom(atom_to_list(Name) ++ "@" ++ host()), - [{kill_if_fail, true}, - {monitor_master, true}, - {init_timeout, 10000}, - {startup_timeout, 10000}, - {erl_flags, ebin_path()}]), - - pong = net_adm:ping(Node), - ok = setup_node(Node, Port), - Node. - -stop_slave(Node) -> - rpc:call(Node, ekka, leave, []), - ct_slave:stop(Node). - -host() -> - [_, Host] = string:tokens(atom_to_list(node()), "@"), Host. - -ebin_path() -> - string:join(["-pa" | lists:filter(fun is_lib/1, code:get_path())], " "). - -is_lib(Path) -> - string:prefix(Path, code:lib_dir()) =:= nomatch. 
- -setup_node(Node, Port) -> - EnvHandler = - fun(emqx) -> - application:set_env( - emqx, - listeners, - [#{listen_on => {{127,0,0,1},Port}, - name => "internal", - opts => [{zone,internal}], - proto => tcp}]), - application:set_env(gen_rpc, port_discovery, stateless), - ok; - (_) -> - ok - end, - - [ok = rpc:call(Node, application, load, [App]) || App <- [gen_rpc, emqx]], - ok = rpc:call(Node, emqx_ct_helpers, start_apps, [[emqx], EnvHandler]), - rpc:call(Node, ekka, join, [node()]), - - %% Sanity check. Assert that `gen_rpc' is set up correctly: - ?assertEqual( Node - , gen_rpc:call(Node, erlang, node, []) - ), - ?assertEqual( node() - , gen_rpc:call(Node, gen_rpc, call, [node(), erlang, node, []]) - ), - ok. + Listeners = [#{listen_on => {{127,0,0,1}, Port}, + name => "internal", + opts => [{zone,internal}], + proto => tcp}], + emqx_node_helpers:start_slave(Name, #{listeners => Listeners}). From 9435b6aa8227f7744630c0bfdb704f05687be655 Mon Sep 17 00:00:00 2001 From: "Zaiming (Stone) Shi" Date: Sat, 13 Aug 2022 14:07:55 +0200 Subject: [PATCH 3/3] docs: Update CHANGES-4.3.md --- CHANGES-4.3.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGES-4.3.md b/CHANGES-4.3.md index 10304fc70..ab74b877a 100644 --- a/CHANGES-4.3.md +++ b/CHANGES-4.3.md @@ -56,7 +56,6 @@ File format: - HTTP API(GET /rules/) support for pagination and fuzzy filtering. [#8450] - Add check_conf cli to check config format. [#8486] - Optimize performance of shared subscription -- Make possible to debug-print SSL handshake procedure by setting listener config `log_level=debug` [#8553](https://github.com/emqx/emqx/pull/8553) ## v4.3.16