From 609f7bd8fd35b1d76086d7f822deb4e1260b6e47 Mon Sep 17 00:00:00 2001 From: Ilya Averyanov Date: Fri, 17 Feb 2023 00:16:29 +0200 Subject: [PATCH] feat(rebalance): port apps from 4.x --- Makefile | 1 + apps/emqx/include/asserts.hrl | 31 + apps/emqx/include/emqx_channel.hrl | 42 + apps/emqx/include/emqx_hooks.hrl | 1 + apps/emqx/priv/bpapi.versions | 5 + apps/emqx/src/emqx_channel.erl | 54 +- apps/emqx/src/emqx_cm.erl | 49 +- apps/emqx/src/emqx_router_helper.erl | 12 +- .../test/emqx_dashboard_api_test_helpers.erl | 6 +- apps/emqx_machine/src/emqx_machine_boot.erl | 10 +- changes/ee/feat-10075.en.md | 1 + changes/ee/feat-10075.zh.md | 1 + lib-ee/emqx_eviction_agent/README.md | 9 + .../etc/emqx_eviction_agent.conf | 3 + .../i18n/emqx_eviction_agent_api_i18n.conf | 14 + lib-ee/emqx_eviction_agent/rebar.config | 2 + .../src/emqx_eviction_agent.app.src | 22 + .../src/emqx_eviction_agent.appup.src | 3 + .../src/emqx_eviction_agent.erl | 346 ++++++++ .../src/emqx_eviction_agent_api.erl | 85 ++ .../src/emqx_eviction_agent_app.erl | 24 + .../src/emqx_eviction_agent_channel.erl | 368 +++++++++ .../src/emqx_eviction_agent_cli.erl | 30 + .../src/emqx_eviction_agent_conn_sup.erl | 21 + .../src/emqx_eviction_agent_sup.erl | 34 + .../proto/emqx_eviction_agent_proto_v1.erl | 27 + .../test/emqx_eviction_agent_SUITE.erl | 403 ++++++++++ .../test/emqx_eviction_agent_api_SUITE.erl | 69 ++ .../emqx_eviction_agent_channel_SUITE.erl | 251 ++++++ .../test/emqx_eviction_agent_cli_SUITE.erl | 39 + .../test/emqx_eviction_agent_test_helpers.erl | 141 ++++ lib-ee/emqx_node_rebalance/README.md | 9 + .../etc/emqx_node_rebalance.conf | 3 + .../i18n/emqx_node_rebalance_api_i18n.conf | 490 ++++++++++++ .../include/emqx_node_rebalance.hrl | 33 + lib-ee/emqx_node_rebalance/rebar.config | 2 + .../src/emqx_node_rebalance.app.src | 22 + .../src/emqx_node_rebalance.appup.src | 3 + .../src/emqx_node_rebalance.erl | 438 +++++++++++ .../src/emqx_node_rebalance_agent.erl | 131 ++++ 
.../src/emqx_node_rebalance_api.erl | 738 ++++++++++++++++++ .../src/emqx_node_rebalance_app.erl | 22 + .../src/emqx_node_rebalance_cli.erl | 305 ++++++++ .../src/emqx_node_rebalance_evacuation.erl | 308 ++++++++ ...emqx_node_rebalance_evacuation_persist.erl | 120 +++ .../src/emqx_node_rebalance_status.erl | 238 ++++++ .../src/emqx_node_rebalance_sup.erl | 35 + .../emqx_node_rebalance_api_proto_v1.erl | 43 + ...mqx_node_rebalance_evacuation_proto_v1.erl | 22 + .../proto/emqx_node_rebalance_proto_v1.erl | 62 ++ .../emqx_node_rebalance_status_proto_v1.erl | 36 + .../test/emqx_node_rebalance_SUITE.erl | 229 ++++++ .../test/emqx_node_rebalance_agent_SUITE.erl | 214 +++++ .../test/emqx_node_rebalance_api_SUITE.erl | 444 +++++++++++ .../test/emqx_node_rebalance_cli_SUITE.erl | 291 +++++++ .../emqx_node_rebalance_evacuation_SUITE.erl | 271 +++++++ ...ode_rebalance_evacuation_persist_SUITE.erl | 108 +++ mix.exs | 4 +- rebar.config.erl | 4 +- 59 files changed, 6686 insertions(+), 43 deletions(-) create mode 100644 apps/emqx/include/asserts.hrl create mode 100644 apps/emqx/include/emqx_channel.hrl create mode 100644 changes/ee/feat-10075.en.md create mode 100644 changes/ee/feat-10075.zh.md create mode 100644 lib-ee/emqx_eviction_agent/README.md create mode 100644 lib-ee/emqx_eviction_agent/etc/emqx_eviction_agent.conf create mode 100644 lib-ee/emqx_eviction_agent/i18n/emqx_eviction_agent_api_i18n.conf create mode 100644 lib-ee/emqx_eviction_agent/rebar.config create mode 100644 lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.app.src create mode 100644 lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.appup.src create mode 100644 lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.erl create mode 100644 lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_api.erl create mode 100644 lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_app.erl create mode 100644 lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_channel.erl create mode 100644 
lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_cli.erl create mode 100644 lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_conn_sup.erl create mode 100644 lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_sup.erl create mode 100644 lib-ee/emqx_eviction_agent/src/proto/emqx_eviction_agent_proto_v1.erl create mode 100644 lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl create mode 100644 lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_api_SUITE.erl create mode 100644 lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_channel_SUITE.erl create mode 100644 lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_cli_SUITE.erl create mode 100644 lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl create mode 100644 lib-ee/emqx_node_rebalance/README.md create mode 100644 lib-ee/emqx_node_rebalance/etc/emqx_node_rebalance.conf create mode 100644 lib-ee/emqx_node_rebalance/i18n/emqx_node_rebalance_api_i18n.conf create mode 100644 lib-ee/emqx_node_rebalance/include/emqx_node_rebalance.hrl create mode 100644 lib-ee/emqx_node_rebalance/rebar.config create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.app.src create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.appup.src create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.erl create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_agent.erl create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_api.erl create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_app.erl create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation.erl create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation_persist.erl create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_status.erl create mode 100644 lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_sup.erl create mode 100644 
lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_api_proto_v1.erl create mode 100644 lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_evacuation_proto_v1.erl create mode 100644 lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_proto_v1.erl create mode 100644 lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_status_proto_v1.erl create mode 100644 lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_SUITE.erl create mode 100644 lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_agent_SUITE.erl create mode 100644 lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl create mode 100644 lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl create mode 100644 lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl create mode 100644 lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_persist_SUITE.erl diff --git a/Makefile b/Makefile index 6741317ee..24af58ebc 100644 --- a/Makefile +++ b/Makefile @@ -179,6 +179,7 @@ clean-all: @rm -f rebar.lock @rm -rf deps @rm -rf _build + @rm -f emqx_dialyzer_*_plt .PHONY: deps-all deps-all: $(REBAR) $(PROFILES:%=deps-%) diff --git a/apps/emqx/include/asserts.hrl b/apps/emqx/include/asserts.hrl new file mode 100644 index 000000000..98d8e72fc --- /dev/null +++ b/apps/emqx/include/asserts.hrl @@ -0,0 +1,31 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +%% This file contains common macros for testing. +%% It must not be used anywhere except in test suites. + +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). + +-define(assertWaitEvent(Code, EventMatch, Timeout), + ?assertMatch( + {_, {ok, EventMatch}}, + ?wait_async_action( + Code, + EventMatch, + Timeout + ) + ) +). diff --git a/apps/emqx/include/emqx_channel.hrl b/apps/emqx/include/emqx_channel.hrl new file mode 100644 index 000000000..d4362633a --- /dev/null +++ b/apps/emqx/include/emqx_channel.hrl @@ -0,0 +1,42 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2017-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-define(CHANNEL_METRICS, [ + recv_pkt, + recv_msg, + 'recv_msg.qos0', + 'recv_msg.qos1', + 'recv_msg.qos2', + 'recv_msg.dropped', + 'recv_msg.dropped.await_pubrel_timeout', + send_pkt, + send_msg, + 'send_msg.qos0', + 'send_msg.qos1', + 'send_msg.qos2', + 'send_msg.dropped', + 'send_msg.dropped.expired', + 'send_msg.dropped.queue_full', + 'send_msg.dropped.too_large' +]). + +-define(INFO_KEYS, [ + conninfo, + conn_state, + clientinfo, + session, + will_msg +]). 
diff --git a/apps/emqx/include/emqx_hooks.hrl b/apps/emqx/include/emqx_hooks.hrl index 1665492c5..2373b5928 100644 --- a/apps/emqx/include/emqx_hooks.hrl +++ b/apps/emqx/include/emqx_hooks.hrl @@ -34,6 +34,7 @@ -define(HP_BRIDGE, 870). -define(HP_DELAY_PUB, 860). %% apps that can stop the hooks chain from continuing +-define(HP_NODE_REBALANCE, 110). -define(HP_EXHOOK, 100). %% == Lowest Priority = 0, don't change this value as the plugins may depend on it. diff --git a/apps/emqx/priv/bpapi.versions b/apps/emqx/priv/bpapi.versions index db4765e3f..dceb38c47 100644 --- a/apps/emqx/priv/bpapi.versions +++ b/apps/emqx/priv/bpapi.versions @@ -13,6 +13,7 @@ {emqx_conf,2}. {emqx_dashboard,1}. {emqx_delayed,1}. +{emqx_eviction_agent,1}. {emqx_exhook,1}. {emqx_gateway_api_listeners,1}. {emqx_gateway_cm,1}. @@ -26,6 +27,10 @@ {emqx_mgmt_cluster,1}. {emqx_mgmt_trace,1}. {emqx_mgmt_trace,2}. +{emqx_node_rebalance,1}. +{emqx_node_rebalance_api,1}. +{emqx_node_rebalance_evacuation,1}. +{emqx_node_rebalance_status,1}. {emqx_persistent_session,1}. {emqx_plugin_libs,1}. {emqx_plugins,1}. diff --git a/apps/emqx/src/emqx_channel.erl b/apps/emqx/src/emqx_channel.erl index 862b72c06..69e0a55f7 100644 --- a/apps/emqx/src/emqx_channel.erl +++ b/apps/emqx/src/emqx_channel.erl @@ -18,6 +18,7 @@ -module(emqx_channel). -include("emqx.hrl"). +-include("emqx_channel.hrl"). -include("emqx_mqtt.hrl"). -include("logger.hrl"). -include("types.hrl"). @@ -57,6 +58,12 @@ clear_keepalive/1 ]). +%% Export for emqx_channel implementations +-export([ + maybe_nack/1, + maybe_mark_as_delivered/2 +]). + %% Exports for CT -export([set_field/3]). @@ -69,7 +76,7 @@ ] ). --export_type([channel/0, opts/0]). +-export_type([channel/0, opts/0, conn_state/0]). -record(channel, { %% MQTT ConnInfo @@ -131,33 +138,6 @@ quota_timer => expire_quota_limit }). 
--define(CHANNEL_METRICS, [ - recv_pkt, - recv_msg, - 'recv_msg.qos0', - 'recv_msg.qos1', - 'recv_msg.qos2', - 'recv_msg.dropped', - 'recv_msg.dropped.await_pubrel_timeout', - send_pkt, - send_msg, - 'send_msg.qos0', - 'send_msg.qos1', - 'send_msg.qos2', - 'send_msg.dropped', - 'send_msg.dropped.expired', - 'send_msg.dropped.queue_full', - 'send_msg.dropped.too_large' -]). - --define(INFO_KEYS, [ - conninfo, - conn_state, - clientinfo, - session, - will_msg -]). - -define(LIMITER_ROUTING, message_routing). -dialyzer({no_match, [shutdown/4, ensure_timer/2, interval/2]}). @@ -1078,10 +1058,12 @@ handle_out(unsuback, {PacketId, _ReasonCodes}, Channel) -> handle_out(disconnect, ReasonCode, Channel) when is_integer(ReasonCode) -> ReasonName = disconnect_reason(ReasonCode), handle_out(disconnect, {ReasonCode, ReasonName}, Channel); -handle_out(disconnect, {ReasonCode, ReasonName}, Channel = ?IS_MQTT_V5) -> - Packet = ?DISCONNECT_PACKET(ReasonCode), +handle_out(disconnect, {ReasonCode, ReasonName}, Channel) -> + handle_out(disconnect, {ReasonCode, ReasonName, #{}}, Channel); +handle_out(disconnect, {ReasonCode, ReasonName, Props}, Channel = ?IS_MQTT_V5) -> + Packet = ?DISCONNECT_PACKET(ReasonCode, Props), {ok, [{outgoing, Packet}, {close, ReasonName}], Channel}; -handle_out(disconnect, {_ReasonCode, ReasonName}, Channel) -> +handle_out(disconnect, {_ReasonCode, ReasonName, _Props}, Channel) -> {ok, {close, ReasonName}, Channel}; handle_out(auth, {ReasonCode, Properties}, Channel) -> {ok, ?AUTH_PACKET(ReasonCode, Properties), Channel}; @@ -1198,13 +1180,19 @@ handle_call( {takeover, 'end'}, Channel = #channel{ session = Session, - pendings = Pendings + pendings = Pendings, + conninfo = #{clientid := ClientId} } ) -> ok = emqx_session:takeover(Session), %% TODO: Should not drain deliver here (side effect) Delivers = emqx_utils:drain_deliver(), AllPendings = lists:append(Delivers, Pendings), + ?tp( + debug, + emqx_channel_takeover_end, + #{clientid => ClientId} + ), 
disconnect_and_shutdown(takenover, AllPendings, Channel); handle_call(list_authz_cache, Channel) -> {reply, emqx_authz_cache:list_authz_cache(), Channel}; @@ -1276,6 +1264,8 @@ handle_info(die_if_test = Info, Channel) -> die_if_test_compiled(), ?SLOG(error, #{msg => "unexpected_info", info => Info}), {ok, Channel}; +handle_info({disconnect, ReasonCode, ReasonName, Props}, Channel) -> + handle_out(disconnect, {ReasonCode, ReasonName, Props}, Channel); handle_info(Info, Channel) -> ?SLOG(error, #{msg => "unexpected_info", info => Info}), {ok, Channel}. diff --git a/apps/emqx/src/emqx_cm.erl b/apps/emqx/src/emqx_cm.erl index 0290b57d3..c8296f317 100644 --- a/apps/emqx/src/emqx_cm.erl +++ b/apps/emqx/src/emqx_cm.erl @@ -23,6 +23,8 @@ -include("logger.hrl"). -include("types.hrl"). -include_lib("snabbkaffe/include/snabbkaffe.hrl"). +-include_lib("stdlib/include/qlc.hrl"). +-include_lib("stdlib/include/ms_transform.hrl"). -export([start_link/0]). @@ -72,6 +74,12 @@ get_session_confs/2 ]). +%% Client management +-export([ + channel_with_session_table/1, + live_connection_table/1 +]). + %% gen_server callbacks -export([ init/1, @@ -593,6 +601,40 @@ all_channels() -> Pat = [{{'_', '$1'}, [], ['$1']}], ets:select(?CHAN_TAB, Pat). +%% @doc Get clientinfo for all clients with sessions +channel_with_session_table(ConnModules) -> + Ms = ets:fun2ms( + fun({{ClientId, _ChanPid}, Info, _Stats}) -> + {ClientId, Info} + end + ), + Table = ets:table(?CHAN_INFO_TAB, [{traverse, {select, Ms}}]), + ConnModuleMap = maps:from_list([{Mod, true} || Mod <- ConnModules]), + qlc:q([ + {ClientId, ConnState, ConnInfo, ClientInfo} + || {ClientId, #{ + conn_state := ConnState, + clientinfo := ClientInfo, + conninfo := #{clean_start := false, conn_mod := ConnModule} = ConnInfo + }} <- + Table, + maps:is_key(ConnModule, ConnModuleMap) + ]). 
+ +%% @doc Get all local connection query handle +live_connection_table(ConnModules) -> + Ms = lists:map(fun live_connection_ms/1, ConnModules), + Table = ets:table(?CHAN_CONN_TAB, [{traverse, {select, Ms}}]), + qlc:q([{ClientId, ChanPid} || {ClientId, ChanPid} <- Table, is_channel_connected(ChanPid)]). + +live_connection_ms(ConnModule) -> + {{{'$1', '$2'}, ConnModule}, [], [{{'$1', '$2'}}]}. + +is_channel_connected(ChanPid) when node(ChanPid) =:= node() -> + ets:member(?CHAN_LIVE_TAB, ChanPid); +is_channel_connected(_ChanPid) -> + false. + %% @doc Get all registered clientIDs. Debug/test interface all_client_ids() -> Pat = [{{'$1', '_'}, [], ['$1']}], @@ -693,7 +735,8 @@ code_change(_OldVsn, State, _Extra) -> %%-------------------------------------------------------------------- clean_down({ChanPid, ClientId}) -> - do_unregister_channel({ClientId, ChanPid}). + do_unregister_channel({ClientId, ChanPid}), + ok = ?tp(debug, emqx_cm_clean_down, #{client_id => ClientId}). stats_fun() -> lists:foreach(fun update_stats/1, ?CHAN_STATS). @@ -719,12 +762,12 @@ get_chann_conn_mod(ClientId, ChanPid) -> wrap_rpc(emqx_cm_proto_v1:get_chann_conn_mod(ClientId, ChanPid)). mark_channel_connected(ChanPid) -> - ?tp(emqx_cm_connected_client_count_inc, #{}), + ?tp(emqx_cm_connected_client_count_inc, #{chan_pid => ChanPid}), ets:insert_new(?CHAN_LIVE_TAB, {ChanPid, true}), ok. mark_channel_disconnected(ChanPid) -> - ?tp(emqx_cm_connected_client_count_dec, #{}), + ?tp(emqx_cm_connected_client_count_dec, #{chan_pid => ChanPid}), ets:delete(?CHAN_LIVE_TAB, ChanPid), ok. diff --git a/apps/emqx/src/emqx_router_helper.erl b/apps/emqx/src/emqx_router_helper.erl index e2d54b99e..4bff98072 100644 --- a/apps/emqx/src/emqx_router_helper.erl +++ b/apps/emqx/src/emqx_router_helper.erl @@ -167,9 +167,15 @@ handle_info(Info, State) -> {noreply, State}. 
terminate(_Reason, _State) -> - ok = ekka:unmonitor(membership), - emqx_stats:cancel_update(route_stats), - mnesia:unsubscribe({table, ?ROUTING_NODE, simple}). + try + ok = ekka:unmonitor(membership), + emqx_stats:cancel_update(route_stats), + mnesia:unsubscribe({table, ?ROUTING_NODE, simple}) + catch + exit:{noproc, {gen_server, call, [mria_membership, _]}} -> + ?SLOG(warning, #{msg => "mria_membership_down"}), + ok + end. code_change(_OldVsn, State, _Extra) -> {ok, State}. diff --git a/apps/emqx_dashboard/test/emqx_dashboard_api_test_helpers.erl b/apps/emqx_dashboard/test/emqx_dashboard_api_test_helpers.erl index 91c7729d3..25b4065de 100644 --- a/apps/emqx_dashboard/test/emqx_dashboard_api_test_helpers.erl +++ b/apps/emqx_dashboard/test/emqx_dashboard_api_test_helpers.erl @@ -20,6 +20,7 @@ set_default_config/0, set_default_config/1, set_default_config/2, + set_default_config/3, request/2, request/3, request/4, @@ -40,11 +41,14 @@ set_default_config(DefaultUsername) -> set_default_config(DefaultUsername, false). set_default_config(DefaultUsername, HAProxyEnabled) -> + set_default_config(DefaultUsername, HAProxyEnabled, #{}). + +set_default_config(DefaultUsername, HAProxyEnabled, Opts) -> Config = #{ listeners => #{ http => #{ enable => true, - bind => 18083, + bind => maps:get(bind, Opts, 18083), inet6 => false, ipv6_v6only => false, max_connections => 512, diff --git a/apps/emqx_machine/src/emqx_machine_boot.erl b/apps/emqx_machine/src/emqx_machine_boot.erl index 82b3d602f..e3f84079b 100644 --- a/apps/emqx_machine/src/emqx_machine_boot.erl +++ b/apps/emqx_machine/src/emqx_machine_boot.erl @@ -149,8 +149,14 @@ basic_reboot_apps() -> emqx_plugins ], case emqx_release:edition() of - ce -> CE; - ee -> CE ++ [] + ce -> + CE; + ee -> + CE ++ + [ + emqx_eviction_agent, + emqx_node_rebalance + ] end. 
sorted_reboot_apps() -> diff --git a/changes/ee/feat-10075.en.md b/changes/ee/feat-10075.en.md new file mode 100644 index 000000000..e6e070ddc --- /dev/null +++ b/changes/ee/feat-10075.en.md @@ -0,0 +1 @@ +Add node rebalance/node evacuation [functionality](https://github.com/emqx/eip/blob/main/active/0020-node-rebalance.md). diff --git a/changes/ee/feat-10075.zh.md b/changes/ee/feat-10075.zh.md new file mode 100644 index 000000000..36c78acb8 --- /dev/null +++ b/changes/ee/feat-10075.zh.md @@ -0,0 +1 @@ +添加节点再平衡/节点疏散[功能](https://github.com/emqx/eip/blob/main/active/0020-node-rebalance.md)。 diff --git a/lib-ee/emqx_eviction_agent/README.md b/lib-ee/emqx_eviction_agent/README.md new file mode 100644 index 000000000..f9b8037bf --- /dev/null +++ b/lib-ee/emqx_eviction_agent/README.md @@ -0,0 +1,9 @@ +emqx_eviction_agent +===== + +An OTP library + +Build +----- + + $ rebar3 compile diff --git a/lib-ee/emqx_eviction_agent/etc/emqx_eviction_agent.conf b/lib-ee/emqx_eviction_agent/etc/emqx_eviction_agent.conf new file mode 100644 index 000000000..011b7fb0f --- /dev/null +++ b/lib-ee/emqx_eviction_agent/etc/emqx_eviction_agent.conf @@ -0,0 +1,3 @@ +##-------------------------------------------------------------------- +## EMQX Eviction Agent Plugin +##-------------------------------------------------------------------- diff --git a/lib-ee/emqx_eviction_agent/i18n/emqx_eviction_agent_api_i18n.conf b/lib-ee/emqx_eviction_agent/i18n/emqx_eviction_agent_api_i18n.conf new file mode 100644 index 000000000..8bb7282c3 --- /dev/null +++ b/lib-ee/emqx_eviction_agent/i18n/emqx_eviction_agent_api_i18n.conf @@ -0,0 +1,14 @@ +emqx_eviction_agent_api { + + node_eviction_status_get { + desc { + en: "Get the node eviction status" + zh: "获取节点驱逐状态" + } + label { + en: "Node Eviction Status" + zh: "节点驱逐状态" + } + } + +} diff --git a/lib-ee/emqx_eviction_agent/rebar.config b/lib-ee/emqx_eviction_agent/rebar.config new file mode 100644 index 000000000..b055d8f4f --- /dev/null +++ 
b/lib-ee/emqx_eviction_agent/rebar.config @@ -0,0 +1,2 @@ +{deps, [{emqx, {path, "../../apps/emqx"}}]}. +{project_plugins, [erlfmt]}. diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.app.src b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.app.src new file mode 100644 index 000000000..a360133f4 --- /dev/null +++ b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.app.src @@ -0,0 +1,22 @@ +{application, emqx_eviction_agent, [ + {description, "EMQX Eviction Agent"}, + {vsn, "5.0.0"}, + {registered, [ + emqx_eviction_agent_sup, + emqx_eviction_agent, + emqx_eviction_agent_conn_sup + ]}, + {applications, [ + kernel, + stdlib, + emqx_ctl + ]}, + {mod, {emqx_eviction_agent_app, []}}, + {env, []}, + {modules, []}, + {maintainers, ["EMQX Team "]}, + {links, [ + {"Homepage", "https://emqx.io/"}, + {"Github", "https://github.com/emqx"} + ]} +]}. diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.appup.src b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.appup.src new file mode 100644 index 000000000..c1b84778d --- /dev/null +++ b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.appup.src @@ -0,0 +1,3 @@ +%% -*- mode: erlang -*- +%% Unless you know what you are doing, DO NOT edit manually!! +{VSN, [{<<".*">>, []}], [{<<".*">>, []}]}. diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.erl b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.erl new file mode 100644 index 000000000..b8e1b5236 --- /dev/null +++ b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent.erl @@ -0,0 +1,346 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_eviction_agent). + +-include_lib("emqx/include/emqx_mqtt.hrl"). +-include_lib("emqx/include/logger.hrl"). +-include_lib("emqx/include/types.hrl"). +-include_lib("emqx/include/emqx_hooks.hrl"). 
+ +-include_lib("stdlib/include/qlc.hrl"). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). + +-export([ + start_link/0, + enable/2, + disable/1, + status/0, + connection_count/0, + session_count/0, + session_count/1, + evict_connections/1, + evict_sessions/2, + evict_sessions/3, + evict_session_channel/3 +]). + +-behaviour(gen_server). + +-export([ + init/1, + handle_call/3, + handle_info/2, + handle_cast/2, + code_change/3 +]). + +-export([ + on_connect/2, + on_connack/3 +]). + +-export([ + hook/0, + unhook/0 +]). + +-export_type([server_reference/0]). + +-define(CONN_MODULES, [emqx_connection, emqx_ws_connection, emqx_eviction_agent_channel]). + +%%-------------------------------------------------------------------- +%% APIs +%%-------------------------------------------------------------------- + +-type server_reference() :: binary() | undefined. +-type status() :: {enabled, conn_stats()} | disabled. +-type conn_stats() :: #{ + connections := non_neg_integer(), + sessions := non_neg_integer() +}. +-type kind() :: atom(). + +-spec start_link() -> startlink_ret(). +start_link() -> + gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). + +-spec enable(kind(), server_reference()) -> ok_or_error(eviction_agent_busy). +enable(Kind, ServerReference) -> + gen_server:call(?MODULE, {enable, Kind, ServerReference}). + +-spec disable(kind()) -> ok. +disable(Kind) -> + gen_server:call(?MODULE, {disable, Kind}). + +-spec status() -> status(). +status() -> + case enable_status() of + {enabled, _Kind, _ServerReference} -> + {enabled, stats()}; + disabled -> + disabled + end. + +-spec evict_connections(pos_integer()) -> ok_or_error(disabled). +evict_connections(N) -> + case enable_status() of + {enabled, _Kind, ServerReference} -> + ok = do_evict_connections(N, ServerReference); + disabled -> + {error, disabled} + end. + +-spec evict_sessions(pos_integer(), node() | [node()]) -> ok_or_error(disabled). 
+evict_sessions(N, Node) when is_atom(Node) -> + evict_sessions(N, [Node]); +evict_sessions(N, Nodes) when is_list(Nodes) andalso length(Nodes) > 0 -> + evict_sessions(N, Nodes, any). + +-spec evict_sessions(pos_integer(), node() | [node()], atom()) -> ok_or_error(disabled). +evict_sessions(N, Node, ConnState) when is_atom(Node) -> + evict_sessions(N, [Node], ConnState); +evict_sessions(N, Nodes, ConnState) when + is_list(Nodes) andalso length(Nodes) > 0 +-> + case enable_status() of + {enabled, _Kind, _ServerReference} -> + ok = do_evict_sessions(N, Nodes, ConnState); + disabled -> + {error, disabled} + end. + +%%-------------------------------------------------------------------- +%% gen_server callbacks +%%-------------------------------------------------------------------- + +init([]) -> + _ = persistent_term:erase(?MODULE), + {ok, #{}}. + +%% enable +handle_call({enable, Kind, ServerReference}, _From, St) -> + Reply = + case enable_status() of + disabled -> + ok = persistent_term:put(?MODULE, {enabled, Kind, ServerReference}); + {enabled, Kind, _ServerReference} -> + ok = persistent_term:put(?MODULE, {enabled, Kind, ServerReference}); + {enabled, _OtherKind, _ServerReference} -> + {error, eviction_agent_busy} + end, + {reply, Reply, St}; +%% disable +handle_call({disable, Kind}, _From, St) -> + Reply = + case enable_status() of + disabled -> + {error, disabled}; + {enabled, Kind, _ServerReference} -> + _ = persistent_term:erase(?MODULE), + ok; + {enabled, _OtherKind, _ServerReference} -> + {error, eviction_agent_busy} + end, + {reply, Reply, St}; +handle_call(Msg, _From, St) -> + ?SLOG(warning, #{msg => "unknown_call", call => Msg, state => St}), + {reply, {error, unknown_call}, St}. + +handle_info(Msg, St) -> + ?SLOG(warning, #{msg => "unknown_msg", info => Msg, state => St}), + {noreply, St}. + +handle_cast(Msg, St) -> + ?SLOG(warning, #{msg => "unknown_cast", cast => Msg, state => St}), + {noreply, St}. + +code_change(_Vsn, State, _Extra) -> + {ok, State}. 
+ +%%-------------------------------------------------------------------- +%% Hook callbacks +%%-------------------------------------------------------------------- + +on_connect(_ConnInfo, _Props) -> + case enable_status() of + {enabled, _Kind, _ServerReference} -> + {stop, {error, ?RC_USE_ANOTHER_SERVER}}; + disabled -> + ignore + end. + +on_connack( + #{proto_name := <<"MQTT">>, proto_ver := ?MQTT_PROTO_V5}, + use_another_server, + Props +) -> + case enable_status() of + {enabled, _Kind, ServerReference} -> + {ok, Props#{'Server-Reference' => ServerReference}}; + disabled -> + {ok, Props} + end; +on_connack(_ClientInfo, _Reason, Props) -> + {ok, Props}. + +%%-------------------------------------------------------------------- +%% Hook funcs +%%-------------------------------------------------------------------- + +hook() -> + ?tp(debug, eviction_agent_hook, #{}), + ok = emqx_hooks:put('client.connack', {?MODULE, on_connack, []}, ?HP_NODE_REBALANCE), + ok = emqx_hooks:put('client.connect', {?MODULE, on_connect, []}, ?HP_NODE_REBALANCE). + +unhook() -> + ?tp(debug, eviction_agent_unhook, #{}), + ok = emqx_hooks:del('client.connect', {?MODULE, on_connect}), + ok = emqx_hooks:del('client.connack', {?MODULE, on_connack}). + +enable_status() -> + persistent_term:get(?MODULE, disabled). + +% connection management +stats() -> + #{ + connections => connection_count(), + sessions => session_count() + }. + +connection_table() -> + emqx_cm:live_connection_table(?CONN_MODULES). + +connection_count() -> + table_count(connection_table()). + +channel_with_session_table(any) -> + qlc:q([ + {ClientId, ConnInfo, ClientInfo} + || {ClientId, _, ConnInfo, ClientInfo} <- + emqx_cm:channel_with_session_table(?CONN_MODULES) + ]); +channel_with_session_table(RequiredConnState) -> + qlc:q([ + {ClientId, ConnInfo, ClientInfo} + || {ClientId, ConnState, ConnInfo, ClientInfo} <- + emqx_cm:channel_with_session_table(?CONN_MODULES), + RequiredConnState =:= ConnState + ]). 
+ +session_count() -> + session_count(any). + +session_count(ConnState) -> + table_count(channel_with_session_table(ConnState)). + +table_count(QH) -> + qlc:fold(fun(_, Acc) -> Acc + 1 end, 0, QH). + +take_connections(N) -> + ChanQH = qlc:q([ChanPid || {_ClientId, ChanPid} <- connection_table()]), + ChanPidCursor = qlc:cursor(ChanQH), + ChanPids = qlc:next_answers(ChanPidCursor, N), + ok = qlc:delete_cursor(ChanPidCursor), + ChanPids. + +take_channel_with_sessions(N, ConnState) -> + ChanPidCursor = qlc:cursor(channel_with_session_table(ConnState)), + Channels = qlc:next_answers(ChanPidCursor, N), + ok = qlc:delete_cursor(ChanPidCursor), + Channels. + +do_evict_connections(N, ServerReference) when N > 0 -> + ChanPids = take_connections(N), + ok = lists:foreach( + fun(ChanPid) -> + disconnect_channel(ChanPid, ServerReference) + end, + ChanPids + ). + +do_evict_sessions(N, Nodes, ConnState) when N > 0 -> + Channels = take_channel_with_sessions(N, ConnState), + ok = lists:foreach( + fun({ClientId, ConnInfo, ClientInfo}) -> + evict_session_channel(Nodes, ClientId, ConnInfo, ClientInfo) + end, + Channels + ). + +evict_session_channel(Nodes, ClientId, ConnInfo, ClientInfo) -> + Node = select_random(Nodes), + ?SLOG( + info, + #{ + msg => "evict_session_channel", + client_id => ClientId, + node => Node, + conn_info => ConnInfo, + client_info => ClientInfo + } + ), + case emqx_eviction_agent_proto_v1:evict_session_channel(Node, ClientId, ConnInfo, ClientInfo) of + {badrpc, Reason} -> + ?SLOG( + error, + #{ + msg => "evict_session_channel_rpc_error", + client_id => ClientId, + node => Node, + reason => Reason + } + ), + {error, Reason}; + {error, Reason} = Error -> + ?SLOG( + error, + #{ + msg => "evict_session_channel_error", + client_id => ClientId, + node => Node, + reason => Reason + } + ), + Error; + Res -> + Res + end. + +-spec evict_session_channel( + emqx_types:clientid(), + emqx_types:conninfo(), + emqx_types:clientinfo() +) -> supervisor:startchild_ret(). 
+evict_session_channel(ClientId, ConnInfo, ClientInfo) -> + ?SLOG(info, #{ + msg => "evict_session_channel", + client_id => ClientId, + conn_info => ConnInfo, + client_info => ClientInfo + }), + Result = emqx_eviction_agent_channel:start_supervised( + #{ + conninfo => ConnInfo, + clientinfo => ClientInfo + } + ), + ?SLOG( + info, + #{ + msg => "evict_session_channel_result", + client_id => ClientId, + result => Result + } + ), + Result. + +disconnect_channel(ChanPid, ServerReference) -> + ChanPid ! + {disconnect, ?RC_USE_ANOTHER_SERVER, use_another_server, #{ + 'Server-Reference' => ServerReference + }}. + +select_random(List) when length(List) > 0 -> + lists:nth(rand:uniform(length(List)), List). diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_api.erl b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_api.erl new file mode 100644 index 000000000..d8c1d7645 --- /dev/null +++ b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_api.erl @@ -0,0 +1,85 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_eviction_agent_api). + +-behaviour(minirest_api). + +-include_lib("typerefl/include/types.hrl"). +-include_lib("hocon/include/hoconsc.hrl"). +-include_lib("emqx/include/logger.hrl"). + +%% Swagger specs from hocon schema +-export([ + api_spec/0, + paths/0, + schema/1, + namespace/0 +]). + +-export([ + fields/1, + roots/0 +]). + +%% API callbacks +-export([ + '/node_eviction/status'/2 +]). + +-import(hoconsc, [mk/2, ref/1, ref/2]). + +namespace() -> "node_eviction". + +api_spec() -> + emqx_dashboard_swagger:spec(?MODULE, #{check_schema => true}). + +paths() -> + [ + "/node_eviction/status" + ]. 
+
+%% Swagger schema of the read-only eviction status endpoint.
+schema("/node_eviction/status") ->
+    StatusGet = #{
+        tags => [<<"node_eviction">>],
+        summary => <<"Get node eviction status">>,
+        description => ?DESC("node_eviction_status_get"),
+        responses => #{200 => schema_status()}
+    },
+    #{'operationId' => '/node_eviction/status', get => StatusGet}.
+
+%% GET handler: always replies 200 with the current eviction agent status.
+'/node_eviction/status'(_Bindings, _Params) ->
+    {200, status_body(emqx_eviction_agent:status())}.
+
+%% Translate the agent status term into the response body.
+status_body(disabled) ->
+    #{status => disabled};
+status_body({enabled, Stats}) ->
+    #{status => enabled, stats => Stats}.
+
+%% The response is either the `status_enabled' or the `status_disabled' struct.
+schema_status() ->
+    Union = hoconsc:union([ref(status_enabled), ref(status_disabled)]),
+    mk(Union, #{}).
+
+roots() -> [].
+
+fields(status_disabled) ->
+    [{status, mk(disabled, #{default => disabled})}];
+fields(status_enabled) ->
+    [
+        {status, mk(enabled, #{default => enabled})},
+        {stats, ref(stats)}
+    ];
+fields(stats) ->
+    [
+        {connections, mk(integer(), #{})},
+        {sessions, mk(integer(), #{})}
+    ].
diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_app.erl b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_app.erl
new file mode 100644
index 000000000..63af59b09
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_app.erl
@@ -0,0 +1,24 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_app).
+
+-behaviour(application).
+
+-emqx_plugin(?MODULE).
+
+-export([
+    start/2,
+    stop/1
+]).
+
+%% Install the hooks, start the supervision tree, then register the CLI.
+start(_Type, _Args) ->
+    ok = emqx_eviction_agent:hook(),
+    {ok, _Sup} = Started = emqx_eviction_agent_sup:start_link(),
+    ok = emqx_eviction_agent_cli:load(),
+    Started.
+
+%% Remove the hooks and unregister the CLI.
+stop(_State) ->
+    ok = emqx_eviction_agent:unhook(),
+    ok = emqx_eviction_agent_cli:unload(),
+    ok.
diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_channel.erl b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_channel.erl
new file mode 100644
index 000000000..a42033c0f
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_channel.erl
@@ -0,0 +1,368 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+%% MQTT Channel
+-module(emqx_eviction_agent_channel).
+
+-include_lib("emqx/include/emqx.hrl").
+-include_lib("emqx/include/emqx_channel.hrl").
+-include_lib("emqx/include/emqx_mqtt.hrl").
+-include_lib("emqx/include/logger.hrl").
+-include_lib("emqx/include/types.hrl").
+
+-include_lib("snabbkaffe/include/snabbkaffe.hrl").
+
+-logger_header("[Evicted Channel]").
+
+-export([
+    start_link/1,
+    start_supervised/1,
+    call/2,
+    call/3,
+    cast/2,
+    stop/1
+]).
+
+-export([
+    init/1,
+    handle_call/3,
+    handle_cast/2,
+    handle_info/2,
+    terminate/2,
+    code_change/3
+]).
+
+-import(
+    emqx_misc,
+    [
+        maybe_apply/2
+    ]
+).
+
+-type opts() :: #{
+    conninfo := emqx_types:conninfo(),
+    clientinfo := emqx_types:clientinfo()
+}.
+
+%%--------------------------------------------------------------------
+%% API
+%%--------------------------------------------------------------------
+
+%% Start a channel process as a temporary child of
+%% `emqx_eviction_agent_conn_sup'.
+%% The child id joins the client id with a unique integer, so several
+%% channels for the same client id never clash inside the supervisor.
+-spec start_supervised(opts()) -> supervisor:startchild_ret().
+start_supervised(#{clientinfo := #{clientid := ClientId}} = Opts) ->
+    RandomId = integer_to_binary(erlang:unique_integer([positive])),
+    ClientIdBin = bin_clientid(ClientId),
+    Id = <<ClientIdBin/binary, "-", RandomId/binary>>,
+    ChildSpec = #{
+        id => Id,
+        start => {?MODULE, start_link, [Opts]},
+        restart => temporary,
+        shutdown => 5000,
+        type => worker,
+        modules => [?MODULE]
+    },
+    supervisor:start_child(
+        emqx_eviction_agent_conn_sup,
+        ChildSpec
+    ).
+
+%% Start the channel gen_server; `init/1' receives `[Opts]'.
+-spec start_link(opts()) -> startlink_ret().
+start_link(Opts) ->
+    gen_server:start_link(?MODULE, [Opts], []).
+
+%% Thin wrappers around the gen_server client API.
+-spec cast(pid(), term()) -> ok.
+cast(Pid, Req) ->
+    gen_server:cast(Pid, Req).
+
+-spec call(pid(), term()) -> term().
+call(Pid, Req) ->
+    gen_server:call(Pid, Req, infinity).
+
+-spec call(pid(), term(), timeout()) -> term().
+call(Pid, Req, Timeout) ->
+    gen_server:call(Pid, Req, Timeout).
+
+-spec stop(pid()) -> ok.
+stop(Pid) ->
+    gen_server:stop(Pid).
+
+%%--------------------------------------------------------------------
+%% gen_server API
+%%--------------------------------------------------------------------
+
+%% Take over the client's existing session and arm the expiry timer.
+%% Startup is aborted with `{stop, Reason}' when either step fails.
+init([#{conninfo := OldConnInfo, clientinfo := #{clientid := ClientId} = OldClientInfo}]) ->
+    process_flag(trap_exit, true),
+    ClientInfo = clientinfo(OldClientInfo),
+    ConnInfo = conninfo(OldConnInfo),
+    case open_with_expiry(ConnInfo, ClientInfo) of
+        {error, Reason} ->
+            {stop, Reason};
+        {ok, Channel} ->
+            ?SLOG(
+                info,
+                #{
+                    msg => "channel_initialized",
+                    clientid => ClientId,
+                    node => node()
+                }
+            ),
+            ok = emqx_cm:mark_channel_disconnected(self()),
+            {ok, Channel, hibernate}
+    end.
+
+%% Open the session and, on success, set up the session expiry timer.
+open_with_expiry(ConnInfo, ClientInfo) ->
+    case open_session(ConnInfo, ClientInfo) of
+        {ok, Channel} -> set_expiry_timer(Channel);
+        {error, Reason} -> {error, Reason}
+    end.
+
+%% Synchronous requests handled by the channel process.
+%% `kick' and `discard' reply `ok' and stop the process with reason
+%% `kicked' / `discarded' respectively.
+handle_call(kick, _From, Channel) ->
+    {stop, kicked, ok, Channel};
+handle_call(discard, _From, Channel) ->
+    {stop, discarded, ok, Channel};
+%% Takeover, phase one: hand the session out and mark the takeover as in
+%% progress, so handle_deliver/2 keeps later deliveries in `pendings'.
+handle_call({takeover, 'begin'}, _From, #{session := Session} = Channel) ->
+    {reply, Session, Channel#{takeover => true}};
+%% Takeover, phase two: reply with the deliveries drained from the mailbox
+%% followed by the accumulated pendings, then stop normally.
+handle_call(
+    {takeover, 'end'},
+    _From,
+    #{
+        session := Session,
+        clientinfo := #{clientid := ClientId},
+        pendings := Pendings
+    } = Channel
+) ->
+    ok = emqx_session:takeover(Session),
+    %% TODO: Should not drain deliver here (side effect)
+    Delivers = emqx_misc:drain_deliver(),
+    AllPendings = lists:append(Delivers, Pendings),
+    ?tp(
+        debug,
+        emqx_channel_takeover_end,
+        #{clientid => ClientId}
+    ),
+    {stop, normal, AllPendings, Channel};
+%% These requests get fixed replies: an empty ACL cache and `ok' for quota.
+handle_call(list_acl_cache, _From, Channel) ->
+    {reply, [], Channel};
+handle_call({quota, _Policy}, _From, Channel) ->
+    {reply, ok, Channel};
+%% Anything else is logged and answered with `ignored'.
+handle_call(Req, _From, Channel) ->
+    ?SLOG(
+        error,
+        #{
+            msg => "unexpected_call",
+            req => Req
+        }
+    ),
+    {reply, ignored, Channel}.
+
+%% A delivery is handled together with every other delivery currently
+%% waiting in the mailbox.
+handle_info(Deliver = {deliver, _Topic, _Msg}, Channel) ->
+    Delivers = [Deliver | emqx_misc:drain_deliver()],
+    {noreply, handle_deliver(Delivers, Channel)};
+%% Sent by the timer armed in set_expiry_timer/1.
+handle_info(expire_session, Channel) ->
+    {stop, expired, Channel};
+handle_info(Info, Channel) ->
+    ?SLOG(
+        error,
+        #{
+            msg => "unexpected_info",
+            info => Info
+        }
+    ),
+    {noreply, Channel}.
+
+%% No casts are expected; they are logged and dropped.
+handle_cast(Msg, Channel) ->
+    ?SLOG(error, #{msg => "unexpected_cast", cast => Msg}),
+    {noreply, Channel}.
+
+%% Cancel the expiry timer, persist the session only when the process stops
+%% with reason `expired', then terminate the session.
+terminate(Reason, #{conninfo := ConnInfo, clientinfo := ClientInfo, session := Session} = Channel) ->
+    ok = cancel_expiry_timer(Channel),
+    (Reason =:= expired) andalso emqx_persistent_session:persist(ClientInfo, ConnInfo, Session),
+    emqx_session:terminate(ClientInfo, Reason, Session).
+
+code_change(_OldVsn, Channel, _Extra) ->
+    {ok, Channel}.
+
+%%--------------------------------------------------------------------
+%% Internal functions
+%%--------------------------------------------------------------------
+
+%% TODO: sync with emqx_channel
+%% Process a batch of deliveries.
+%% During a takeover the messages are appended to `pendings'; otherwise
+%% they are enqueued into the session, which is then persisted.
+handle_deliver(
+    Delivers,
+    #{
+        takeover := true,
+        pendings := Pendings,
+        session := Session,
+        clientinfo := #{clientid := ClientId} = ClientInfo
+    } = Channel
+) ->
+    %% NOTE: Order is important here. While the takeover is in
+    %% progress, the session cannot enqueue messages, since it already
+    %% passed on the queue to the new connection in the session state.
+    Nacked = emqx_channel:maybe_nack(Delivers),
+    Kept = emqx_session:ignore_local(ClientInfo, Nacked, ClientId, Session),
+    Channel#{pendings => Pendings ++ Kept};
+handle_deliver(
+    Delivers,
+    #{
+        takeover := false,
+        session := Session,
+        clientinfo := #{clientid := ClientId} = ClientInfo
+    } = Channel
+) ->
+    Nacked = emqx_channel:maybe_nack(Delivers),
+    Kept = emqx_session:ignore_local(ClientInfo, Nacked, ClientId, Session),
+    Updated = persist(emqx_session:enqueue(ClientInfo, Kept, Session), Channel),
+    %% We consider queued/dropped messages as delivered since they are now in the session state.
+    emqx_channel:maybe_mark_as_delivered(Session, Delivers),
+    Updated.
+
+%% Cancel the expiry timer when one is armed; always returns `ok'.
+cancel_expiry_timer(Channel) ->
+    case Channel of
+        #{expiry_timer := TRef} when is_reference(TRef) ->
+            _ = erlang:cancel_timer(TRef),
+            ok;
+        _ ->
+            ok
+    end.
+
+%% Arm the session expiry timer according to `expiry_interval' in conninfo.
+set_expiry_timer(#{conninfo := ConnInfo} = Channel) ->
+    arm_expiry_timer(maps:get(expiry_interval, ConnInfo), Channel).
+
+%% ?UINT_MAX arms no timer; a positive interval schedules `expire_session';
+%% anything else is rejected because the session should already be expired.
+arm_expiry_timer(?UINT_MAX, Channel) ->
+    {ok, Channel};
+arm_expiry_timer(Interval, Channel) when Interval > 0 ->
+    TRef = erlang:send_after(timer:seconds(Interval), self(), expire_session),
+    {ok, Channel#{expiry_timer => TRef}};
+arm_expiry_timer(_Interval, _Channel) ->
+    {error, should_be_expired}.
+
+%% Open the client's existing session through emqx_cm (clean start is `false').
+%% Returns `{ok, Channel}' with the taken-over session, `{error, no_session}'
+%% when no session is present, or the error reported by emqx_cm.
+open_session(ConnInfo, #{clientid := ClientId} = ClientInfo) ->
+    Channel = channel(ConnInfo, ClientInfo),
+    Opened = emqx_cm:open_session(false, ClientInfo, ConnInfo),
+    case Opened of
+        {ok, #{present := false}} ->
+            ?SLOG(
+                info,
+                #{
+                    msg => "no_session",
+                    clientid => ClientId,
+                    node => node()
+                }
+            ),
+            {error, no_session};
+        {ok, #{session := Session, present := true, pendings := Pendings}} ->
+            ?SLOG(
+                info,
+                #{
+                    msg => "session_opened",
+                    clientid => ClientId,
+                    node => node()
+                }
+            ),
+            NChannel = Channel#{session => enqueue_pendings(ClientInfo, Pendings, Session)},
+            ok = emqx_cm:insert_channel_info(ClientId, info(NChannel), stats(NChannel)),
+            ?SLOG(
+                info,
+                #{
+                    msg => "channel_info_updated",
+                    clientid => ClientId,
+                    node => node()
+                }
+            ),
+            {ok, NChannel};
+        {error, Reason} ->
+            ?SLOG(
+                error,
+                #{
+                    msg => "session_open_failed",
+                    clientid => ClientId,
+                    node => node(),
+                    reason => Reason
+                }
+            ),
+            Opened
+    end.
+
+%% Merge the pendings handed over with the session and the deliveries
+%% already waiting in the mailbox, then enqueue them into the session.
+enqueue_pendings(#{clientid := ClientId} = ClientInfo, Pendings, Session) ->
+    Merged = lists:usort(Pendings ++ emqx_misc:drain_deliver()),
+    Nacked = emqx_channel:maybe_nack(Merged),
+    Kept = emqx_session:ignore_local(ClientInfo, Nacked, ClientId, Session),
+    emqx_session:enqueue(ClientInfo, Kept, Session).
+
+%% Build the conninfo of the evicted channel: keep a fixed set of keys,
+%% mark it as not connected and default `disconnected_at' to the current time.
+conninfo(OldConnInfo) ->
+    KeptKeys = [
+        socktype,
+        sockname,
+        peername,
+        peercert,
+        clientid,
+        clean_start,
+        receive_maximum,
+        expiry_interval,
+        connected_at,
+        disconnected_at,
+        keepalive
+    ],
+    Now = erlang:system_time(millisecond),
+    Kept = maps:with(KeptKeys, OldConnInfo),
+    Kept#{
+        conn_mod => ?MODULE,
+        connected => false,
+        disconnected_at => maps:get(disconnected_at, OldConnInfo, Now)
+    }.
+
+%% Keep only a fixed set of keys from the original clientinfo.
+clientinfo(OldClientInfo) ->
+    KeptKeys = [
+        zone,
+        protocol,
+        peerhost,
+        sockport,
+        clientid,
+        username,
+        is_bridge,
+        is_superuser,
+        mountpoint
+    ],
+    maps:with(KeptKeys, OldClientInfo).
+
+%% Initial channel state; the session is added by open_session/2.
+channel(ConnInfo, ClientInfo) ->
+    #{
+        conninfo => ConnInfo,
+        clientinfo => ClientInfo,
+        expiry_timer => undefined,
+        takeover => false,
+        resuming => false,
+        pendings => []
+    }.
+
+%% Persist the session and store the returned session in the channel.
+persist(Session, #{clientinfo := ClientInfo, conninfo := ConnInfo} = Channel) ->
+    Channel#{session => emqx_persistent_session:persist(ClientInfo, ConnInfo, Session)}.
+
+%% Channel info as inserted into emqx_cm; the connection state is always
+%% reported as `disconnected'.
+info(Channel) ->
+    Lookup = fun(Key) -> maps:get(Key, Channel, undefined) end,
+    #{
+        conninfo => Lookup(conninfo),
+        clientinfo => Lookup(clientinfo),
+        session => maybe_apply(fun emqx_session:info/1, Lookup(session)),
+        conn_state => disconnected
+    }.
+
+%% Session stats followed by the ?CHANNEL_METRICS counters of this process.
+stats(#{session := Session}) ->
+    emqx_session:stats(Session) ++ emqx_pd:get_counters(?CHANNEL_METRICS).
+
+%% Client ids may be atoms or binaries; normalise to a binary.
+bin_clientid(ClientId) when is_atom(ClientId) ->
+    atom_to_binary(ClientId);
+bin_clientid(ClientId) when is_binary(ClientId) ->
+    ClientId.
diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_cli.erl b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_cli.erl
new file mode 100644
index 000000000..3ae9365e3
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_cli.erl
@@ -0,0 +1,30 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_cli).
+
+%% APIs
+-export([
+    load/0,
+    unload/0,
+    cli/1
+]).
+
+%% Register the `eviction' command with emqx_ctl.
+load() ->
+    emqx_ctl:register_command(eviction, {?MODULE, cli}, []).
+
+%% Unregister the `eviction' command.
+unload() ->
+    emqx_ctl:unregister_command(eviction).
+
+%% `eviction status' prints whether eviction is enabled; any other
+%% argument list prints the usage.
+cli(["status"]) ->
+    emqx_ctl:print(status_line(emqx_eviction_agent:status()));
+cli(_) ->
+    emqx_ctl:usage(
+        [{"eviction status", "Get current node eviction status"}]
+    ).
+
+%% Message printed for each agent status.
+status_line(disabled) ->
+    "Eviction status: disabled~n";
+status_line({enabled, _Stats}) ->
+    "Eviction status: enabled~n".
diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_conn_sup.erl b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_conn_sup.erl
new file mode 100644
index 000000000..195555bd3
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_conn_sup.erl
@@ -0,0 +1,21 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_conn_sup).
+
+-behaviour(supervisor).
+
+-export([start_link/0]).
+
+-export([init/1]).
+
+%% Start the locally registered supervisor.
+start_link() ->
+    supervisor:start_link({local, ?MODULE}, ?MODULE, []).
+
+%% Starts with no children; they are added dynamically with
+%% supervisor:start_child/2.
+init([]) ->
+    {ok,
+        {
+            #{strategy => one_for_one, intensity => 10, period => 3600},
+            []
+        }}.
diff --git a/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_sup.erl b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_sup.erl
new file mode 100644
index 000000000..8b774ef85
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/src/emqx_eviction_agent_sup.erl
@@ -0,0 +1,34 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_sup).
+
+-behaviour(supervisor).
+
+-export([start_link/0]).
+
+-export([init/1]).
+
+%% Start the locally registered top-level supervisor of the application.
+start_link() ->
+    supervisor:start_link({local, ?MODULE}, ?MODULE, []).
+
+%% Supervises the eviction agent worker and the channel supervisor.
+init([]) ->
+    Children = [
+        child_spec(worker, emqx_eviction_agent, []),
+        child_spec(supervisor, emqx_eviction_agent_conn_sup, [])
+    ],
+    {ok, {
+        #{strategy => one_for_one, intensity => 10, period => 3600},
+        Children
+    }}.
+
+%% Permanent child spec for `Mod', started through `Mod:start_link/N'.
+child_spec(Type, Mod, Args) ->
+    #{
+        id => Mod,
+        start => {Mod, start_link, Args},
+        restart => permanent,
+        shutdown => shutdown(Type),
+        type => Type,
+        modules => [Mod]
+    }.
+
+%% Supervisor children get `infinity' so that their own subtree has time to
+%% shut down before the parent kills them; workers keep the 5 s timeout.
+shutdown(supervisor) -> infinity;
+shutdown(worker) -> 5000.
diff --git a/lib-ee/emqx_eviction_agent/src/proto/emqx_eviction_agent_proto_v1.erl b/lib-ee/emqx_eviction_agent/src/proto/emqx_eviction_agent_proto_v1.erl
new file mode 100644
index 000000000..f4c958150
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/src/proto/emqx_eviction_agent_proto_v1.erl
@@ -0,0 +1,27 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_proto_v1).
+
+-behaviour(emqx_bpapi).
+
+-export([
+    introduced_in/0,
+
+    evict_session_channel/4
+]).
+
+-include_lib("emqx/include/bpapi.hrl").
+
+%% Release in which this protocol version was introduced.
+introduced_in() ->
+    "5.0.22".
+
+%% Call `emqx_eviction_agent:evict_session_channel/3' on `Node'.
+%% The success case of the remote function is part of the return type,
+%% hence `startchild_ret()' rather than `startchild_err()'.
+-spec evict_session_channel(
+    node(),
+    emqx_types:clientid(),
+    emqx_types:conninfo(),
+    emqx_types:clientinfo()
+) -> supervisor:startchild_ret() | emqx_rpc:badrpc().
+evict_session_channel(Node, ClientId, ConnInfo, ClientInfo) ->
+    rpc:call(Node, emqx_eviction_agent, evict_session_channel, [ClientId, ConnInfo, ClientInfo]).
diff --git a/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl
new file mode 100644
index 000000000..0574ccec3
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl
@@ -0,0 +1,403 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_SUITE).
+
+-compile(export_all).
+-compile(nowarn_export_all).
+
+-include_lib("eunit/include/eunit.hrl").
+-include_lib("common_test/include/ct.hrl").
+-include_lib("emqx/include/emqx_mqtt.hrl").
+-include_lib("emqx/include/asserts.hrl").
+
+-import(
+    emqx_eviction_agent_test_helpers,
+    [emqtt_connect/0, emqtt_connect/1, emqtt_connect/2]
+).
+
+%% Assert that the output produced by `Code' matches the regular expression `Printed'.
+-define(assertPrinted(Printed, Code),
+    ?assertMatch(
+        {match, _},
+        re:run(Code, Printed)
+    )
+).
+
+all() ->
+    emqx_common_test_helpers:all(?MODULE).
+
+init_per_suite(Config) ->
+    Apps = [emqx_eviction_agent],
+    emqx_common_test_helpers:start_apps(Apps),
+    Config.
+
+end_per_suite(_Config) ->
+    Apps = [emqx_eviction_agent],
+    emqx_common_test_helpers:stop_apps(Apps).
+
+%% Every case starts with eviction disabled and a fresh snabbkaffe trace.
+init_per_testcase(Case, Config) ->
+    _ = emqx_eviction_agent:disable(test_eviction),
+    ok = snabbkaffe:start_trace(),
+    start_slave(Case, Config).
+
+%% Only the takeover case needs a two-node cluster; its nodes are stored
+%% in the config under `evacuate_nodes'.
+start_slave(Case, Config) ->
+    case Case of
+        t_explicit_session_takeover ->
+            NodeSpecs = [{evacuate_test1, 2883}, {evacuate_test2, 3883}],
+            ClusterNodes = emqx_eviction_agent_test_helpers:start_cluster(
+                NodeSpecs,
+                [emqx_eviction_agent]
+            ),
+            [{evacuate_nodes, ClusterNodes} | Config];
+        _ ->
+            Config
+    end.
+
+end_per_testcase(TestCase, Config) ->
+    emqx_eviction_agent:disable(test_eviction),
+    ok = snabbkaffe:stop(),
+    stop_slave(TestCase, Config).
+
+%% Counterpart of start_slave/2.
+stop_slave(Case, Config) ->
+    case Case of
+        t_explicit_session_takeover ->
+            ClusterNodes = ?config(evacuate_nodes, Config),
+            emqx_eviction_agent_test_helpers:stop_cluster(
+                ClusterNodes,
+                [emqx_eviction_agent]
+            );
+        _ ->
+            ok
+    end.
+
+%%--------------------------------------------------------------------
+%% Tests
+%%--------------------------------------------------------------------
+
+%% The agent can be enabled and disabled only by the same kind; while it
+%% is enabled, new connections are refused with `use_another_server'.
+t_enable_disable(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    ?assertMatch(disabled, emqx_eviction_agent:status()),
+
+    {ok, C0} = emqtt_connect(),
+    ok = emqtt:disconnect(C0),
+
+    ok = emqx_eviction_agent:enable(test_eviction, undefined),
+
+    %% A different kind cannot enable the agent while it is in use ...
+    ?assertMatch({error, eviction_agent_busy}, emqx_eviction_agent:enable(bar, undefined)),
+
+    %% ... but the same kind can enable it again.
+    ?assertMatch(ok, emqx_eviction_agent:enable(test_eviction, <<"srv">>)),
+
+    ?assertMatch({enabled, #{}}, emqx_eviction_agent:status()),
+
+    ?assertMatch({error, {use_another_server, #{}}}, emqtt_connect()),
+
+    ?assertMatch({error, eviction_agent_busy}, emqx_eviction_agent:disable(bar)),
+
+    ?assertMatch(ok, emqx_eviction_agent:disable(test_eviction)),
+
+    ?assertMatch({error, disabled}, emqx_eviction_agent:disable(test_eviction)),
+
+    ?assertMatch(disabled, emqx_eviction_agent:status()),
+
+    {ok, C1} = emqtt_connect(),
+    ok = emqtt:disconnect(C1).
+
+%% Evicting connections requires an enabled agent; after the eviction the
+%% status reports no connections.
+t_evict_connections_status(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    {ok, _C} = emqtt_connect(),
+
+    {error, disabled} = emqx_eviction_agent:evict_connections(1),
+
+    ok = emqx_eviction_agent:enable(test_eviction, undefined),
+
+    StatusBefore = emqx_eviction_agent:status(),
+    ?assertMatch({enabled, #{connections := 1, sessions := _}}, StatusBefore),
+
+    ok = emqx_eviction_agent:evict_connections(1),
+
+    ct:sleep(100),
+
+    StatusAfter = emqx_eviction_agent:status(),
+    ?assertMatch({enabled, #{connections := 0, sessions := _}}, StatusAfter),
+
+    ok = emqx_eviction_agent:disable(test_eviction).
+
+%% A session is evicted first to its own node and then to another node.
+%% Messages published while the client is away must be delivered when the
+%% client reconnects.
+t_explicit_session_takeover(Config) ->
+    _ = erlang:process_flag(trap_exit, true),
+    ok = restart_emqx(),
+
+    [{Node1, Port1}, {Node2, _Port2}] = ?config(evacuate_nodes, Config),
+
+    {ok, C0} = emqtt_connect([
+        {clientid, <<"client_with_session">>},
+        {clean_start, false},
+        {port, Port1}
+    ]),
+    {ok, _, _} = emqtt:subscribe(C0, <<"t1">>),
+
+    ok = rpc:call(Node1, emqx_eviction_agent, enable, [test_eviction, undefined]),
+
+    ?assertEqual(
+        1,
+        rpc:call(Node1, emqx_eviction_agent, connection_count, [])
+    ),
+
+    [ChanPid] = rpc:call(Node1, emqx_cm, lookup_channels, [<<"client_with_session">>]),
+
+    %% Evict the connection: the client process must exit with
+    %% ?RC_USE_ANOTHER_SERVER and the connected-client counter must be
+    %% decremented for that channel.
+    ?assertWaitEvent(
+        begin
+            ok = rpc:call(Node1, emqx_eviction_agent, evict_connections, [1]),
+            receive
+                {'EXIT', C0, {disconnected, ?RC_USE_ANOTHER_SERVER, _}} -> ok
+            after 1000 ->
+                ?assert(false, "Connection not evicted")
+            end
+        end,
+        #{?snk_kind := emqx_cm_connected_client_count_dec, chan_pid := ChanPid},
+        2000
+    ),
+
+    ?assertEqual(
+        0,
+        rpc:call(Node1, emqx_eviction_agent, connection_count, [])
+    ),
+
+    ?assertEqual(
+        1,
+        rpc:call(Node1, emqx_eviction_agent, session_count, [])
+    ),
+
+    %% First, evacuate to the same node
+
+    ?assertWaitEvent(
+        rpc:call(Node1, emqx_eviction_agent, evict_sessions, [1, Node1]),
+        #{?snk_kind := emqx_channel_takeover_end, clientid := <<"client_with_session">>},
+        1000
+    ),
+
+    ok = rpc:call(Node1, emqx_eviction_agent, disable, [test_eviction]),
+
+    {ok, C1} = emqtt_connect([{port, Port1}]),
+    emqtt:publish(C1, <<"t1">>, <<"MessageToEvictedSession1">>),
+    ok = emqtt:disconnect(C1),
+
+    ok = rpc:call(Node1, emqx_eviction_agent, enable, [test_eviction, undefined]),
+
+    %% Evacuate to another node
+
+    ?assertWaitEvent(
+        rpc:call(Node1, emqx_eviction_agent, evict_sessions, [1, Node2]),
+        #{?snk_kind := emqx_channel_takeover_end, clientid := <<"client_with_session">>},
+        1000
+    ),
+
+    ?assertEqual(
+        0,
+        rpc:call(Node1, emqx_eviction_agent, session_count, [])
+    ),
+
+    ?assertEqual(
+        1,
+        rpc:call(Node2, emqx_eviction_agent, session_count, [])
+    ),
+
+    ok = rpc:call(Node1, emqx_eviction_agent, disable, [test_eviction]),
+
+    %% Session is on Node2, but we connect to Node1
+    {ok, C2} = emqtt_connect([{port, Port1}]),
+    emqtt:publish(C2, <<"t1">>, <<"MessageToEvictedSession2">>),
+    ok = emqtt:disconnect(C2),
+
+    ct:sleep(100),
+
+    %% Session is on Node2, but we connect the subscribed client to Node1
+    %% It should take over the session for the third time and receive
+    %% previously published messages
+    {ok, C3} = emqtt_connect([
+        {clientid, <<"client_with_session">>},
+        {clean_start, false},
+        {port, Port1}
+    ]),
+
+    ok = assert_receive_publish(
+        [
+            #{payload => <<"MessageToEvictedSession1">>, topic => <<"t1">>},
+            #{payload => <<"MessageToEvictedSession2">>, topic => <<"t1">>}
+        ]
+    ),
+    ok = emqtt:disconnect(C3).
+
+%% After the agent process is terminated and restarted under its
+%% supervisor, the status is `disabled' again.
+t_disable_on_restart(_Config) ->
+    ok = emqx_eviction_agent:enable(test_eviction, undefined),
+
+    ok = supervisor:terminate_child(emqx_eviction_agent_sup, emqx_eviction_agent),
+    {ok, _} = supervisor:restart_child(emqx_eviction_agent_sup, emqx_eviction_agent),
+
+    ?assertEqual(
+        disabled,
+        emqx_eviction_agent:status()
+    ).
+
+%% An evicted session must still be listed by the management API and CLI,
+%% and must be removable with emqx_cm:kick_session/1.
+t_session_serialization(_Config) ->
+    _ = erlang:process_flag(trap_exit, true),
+    ok = restart_emqx(),
+
+    {ok, C0} = emqtt_connect(<<"client_with_session">>, false),
+    {ok, _, _} = emqtt:subscribe(C0, <<"t1">>),
+    ok = emqtt:disconnect(C0),
+
+    ok = emqx_eviction_agent:enable(test_eviction, undefined),
+
+    ?assertEqual(
+        1,
+        emqx_eviction_agent:session_count()
+    ),
+
+    %% Evacuate to the same node
+
+    ?assertWaitEvent(
+        emqx_eviction_agent:evict_sessions(1, node()),
+        #{?snk_kind := emqx_channel_takeover_end, clientid := <<"client_with_session">>},
+        1000
+    ),
+
+    ok = emqx_eviction_agent:disable(test_eviction),
+
+    ?assertEqual(
+        1,
+        emqx_eviction_agent:session_count()
+    ),
+
+    ?assertMatch(
+        #{data := [#{clientid := <<"client_with_session">>}]},
+        emqx_mgmt_api:cluster_query(
+            emqx_channel_info,
+            #{},
+            [],
+            fun emqx_mgmt_api_clients:qs2ms/2,
+            fun emqx_mgmt_api_clients:format_channel_info/2
+        )
+    ),
+
+    %% From here on emqx_ctl print/usage calls return the formatted text,
+    %% so that it can be matched by ?assertPrinted.
+    mock_print(),
+
+    ?assertPrinted(
+        "client_with_session",
+        emqx_mgmt_cli:clients(["list"])
+    ),
+
+    ?assertPrinted(
+        "client_with_session",
+        emqx_mgmt_cli:clients(["show", "client_with_session"])
+    ),
+
+    ?assertWaitEvent(
+        emqx_cm:kick_session(<<"client_with_session">>),
+        #{?snk_kind := emqx_cm_clean_down, client_id := <<"client_with_session">>},
+        1000
+    ),
+
+    ?assertEqual(
+        0,
+        emqx_eviction_agent:session_count()
+    ).
+
+%% A client disconnected with ?RC_USE_ANOTHER_SERVER must still have its
+%% will message published.
+t_will_msg(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    WillMsg = <<"will_msg">>,
+    WillTopic = <<"will_topic">>,
+    ClientId = <<"client_with_will">>,
+
+    _ = emqtt_connect([
+        {clean_start, false},
+        {clientid, ClientId},
+        {will_payload, WillMsg},
+        {will_topic, WillTopic}
+    ]),
+
+    {ok, C} = emqtt_connect(),
+    {ok, _, _} = emqtt:subscribe(C, WillTopic),
+
+    [ChanPid] = emqx_cm:lookup_channels(ClientId),
+
+    %% Same message shape as the one sent by the eviction agent when it
+    %% disconnects a channel.
+    ChanPid !
+        {disconnect, ?RC_USE_ANOTHER_SERVER, use_another_server, #{
+            'Server-Reference' => <<>>
+        }},
+
+    receive
+        {publish, #{
+            payload := WillMsg,
+            topic := WillTopic
+        }} ->
+            ok
+    after 1000 ->
+        ct:fail("Will message not received")
+    end,
+
+    ok = emqtt:disconnect(C).
+
+%%--------------------------------------------------------------------
+%% Helpers
+%%--------------------------------------------------------------------
+
+% sn_connect_and_subscribe(ClientId, Topic) ->
+%     emqx_eviction_agent_test_helpers:sn_connect_and_subscribe(ClientId, Topic).
+
+%% Expect the listed publishes to arrive in the given order, waiting at
+%% most one second for each.
+assert_receive_publish([]) ->
+    ok;
+assert_receive_publish([#{payload := Msg, topic := Topic} | Rest]) ->
+    receive
+        {publish, #{
+            payload := Msg,
+            topic := Topic
+        }} ->
+            assert_receive_publish(Rest)
+    after 1000 ->
+        ?assert(false, "Message `" ++ binary_to_list(Msg) ++ "` is lost")
+    end.
+
+%% Publish one message using a short-lived default client.
+connect_and_publish(Topic, Message) ->
+    {ok, C} = emqtt_connect(),
+    emqtt:publish(C, Topic, Message),
+    ok = emqtt:disconnect(C).
+
+%% Restart emqx and the eviction agent; start/stop results are ignored.
+restart_emqx() ->
+    _ = application:stop(emqx),
+    _ = application:start(emqx),
+    _ = application:stop(emqx_eviction_agent),
+    _ = application:start(emqx_eviction_agent),
+    ok.
+
+%% Replace emqx_ctl print/usage with the corresponding format functions,
+%% so that callers get the text as a return value.
+mock_print() ->
+    catch meck:unload(emqx_ctl),
+    meck:new(emqx_ctl, [non_strict, passthrough]),
+    meck:expect(emqx_ctl, print, fun(Arg) -> emqx_ctl:format(Arg, []) end),
+    meck:expect(emqx_ctl, print, fun(Msg, Arg) -> emqx_ctl:format(Msg, Arg) end),
+    meck:expect(emqx_ctl, usage, fun(Usages) -> emqx_ctl:format_usage(Usages) end),
+    meck:expect(emqx_ctl, usage, fun(Cmd, Descr) -> emqx_ctl:format_usage(Cmd, Descr) end).
diff --git a/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_api_SUITE.erl b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_api_SUITE.erl
new file mode 100644
index 000000000..3fe15e53a
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_api_SUITE.erl
@@ -0,0 +1,69 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_api_SUITE).
+
+-compile(export_all).
+-compile(nowarn_export_all).
+
+-include_lib("eunit/include/eunit.hrl").
+-include_lib("common_test/include/ct.hrl").
+
+-import(
+    emqx_mgmt_api_test_util,
+    [
+        request_api/2,
+        uri/1
+    ]
+).
+
+all() ->
+    emqx_common_test_helpers:all(?MODULE).
+
+init_per_suite(Config) ->
+    Apps = [emqx_eviction_agent],
+    emqx_mgmt_api_test_util:init_suite(Apps),
+    Config.
+
+end_per_suite(Config) ->
+    Apps = [emqx_eviction_agent],
+    emqx_mgmt_api_test_util:end_suite(Apps),
+    Config.
+
+%%--------------------------------------------------------------------
+%% Tests
+%%--------------------------------------------------------------------
+
+%% The status endpoint follows the agent through disabled -> enabled -> disabled.
+t_status(_Config) ->
+    StatusPath = ["node_eviction", "status"],
+
+    ?assertMatch({ok, #{<<"status">> := <<"disabled">>}}, api_get(StatusPath)),
+
+    ok = emqx_eviction_agent:enable(apitest, undefined),
+
+    ?assertMatch(
+        {ok, #{<<"status">> := <<"enabled">>, <<"stats">> := #{}}},
+        api_get(StatusPath)
+    ),
+
+    ok = emqx_eviction_agent:disable(apitest),
+
+    ?assertMatch({ok, #{<<"status">> := <<"disabled">>}}, api_get(StatusPath)).
+
+%%--------------------------------------------------------------------
+%% Helpers
+%%--------------------------------------------------------------------
+
+%% GET the management API path and decode the JSON body into a map.
+api_get(Path) ->
+    Response = request_api(get, uri(Path)),
+    case Response of
+        {error, _} ->
+            Response;
+        {ok, ResponseBody} ->
+            Json = list_to_binary(ResponseBody),
+            {ok, jiffy:decode(Json, [return_maps])}
+    end.
diff --git a/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_channel_SUITE.erl b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_channel_SUITE.erl
new file mode 100644
index 000000000..3b7ef6672
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_channel_SUITE.erl
@@ -0,0 +1,251 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_channel_SUITE).
+
+-compile(export_all).
+-compile(nowarn_export_all).
+
+-include_lib("eunit/include/eunit.hrl").
+-include_lib("common_test/include/ct.hrl").
+-include_lib("emqx/include/emqx_mqtt.hrl").
+
+-define(CLIENT_ID, <<"client_with_session">>).
+
+-import(
+    emqx_eviction_agent_test_helpers,
+    [emqtt_connect/0, emqtt_connect/2]
+).
+
+all() ->
+    emqx_common_test_helpers:all(?MODULE).
+
+init_per_suite(Config) ->
+    Apps = [emqx_conf, emqx_eviction_agent],
+    emqx_common_test_helpers:start_apps(Apps),
+    {ok, _} = emqx:update_config([rpc, port_discovery], manual),
+    Config.
+
+end_per_suite(_Config) ->
+    Apps = [emqx_eviction_agent, emqx_conf],
+    emqx_common_test_helpers:stop_apps(Apps).
+
+%% Only t_persistence needs the persistent session store switched on.
+init_per_testcase(Case, Config) ->
+    case Case of
+        t_persistence ->
+            emqx_config:put([persistent_session_store, enabled], true),
+            {ok, _} = emqx_persistent_session_sup:start_link(),
+            emqx_persistent_session:init_db_backend(),
+            ?assert(emqx_persistent_session:is_store_enabled()),
+            Config;
+        _TestCase ->
+            Config
+    end.
+
+%% Switch the persistent session store off again after t_persistence.
+end_per_testcase(Case, Config) ->
+    case Case of
+        t_persistence ->
+            emqx_config:put([persistent_session_store, enabled], false),
+            emqx_persistent_session:init_db_backend(),
+            ?assertNot(emqx_persistent_session:is_store_enabled()),
+            Config;
+        _TestCase ->
+            ok
+    end.
+
+%%--------------------------------------------------------------------
+%% Tests
+%%--------------------------------------------------------------------
+
+%% Without an existing session the channel refuses to start.
+t_start_no_session(_Config) ->
+    ?assertMatch(
+        {error, {no_session, _}},
+        emqx_eviction_agent_channel:start_supervised(channel_opts(10000))
+    ).
+
+%% A zero expiry interval means the session should already be expired.
+t_start_no_expire(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    _ = emqtt_connect(?CLIENT_ID, false),
+
+    ?assertMatch(
+        {error, {should_be_expired, _}},
+        emqx_eviction_agent_channel:start_supervised(channel_opts(0))
+    ).
+
+%% ?UINT_MAX as the expiry interval is accepted.
+t_start_infinite_expire(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    _ = emqtt_connect(?CLIENT_ID, false),
+
+    ?assertMatch(
+        {ok, _},
+        emqx_eviction_agent_channel:start_supervised(channel_opts(?UINT_MAX))
+    ).
+
+%% A `kick' call is answered with `ok'.
+t_kick(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    _ = emqtt_connect(?CLIENT_ID, false),
+    Opts = evict_session_opts(?CLIENT_ID),
+
+    {ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts),
+
+    ?assertEqual(ok, emqx_eviction_agent_channel:call(Pid, kick)).
+
+%% Minimal start options for ?CLIENT_ID with the given expiry interval.
+channel_opts(ExpiryInterval) ->
+    #{
+        clientinfo => #{
+            clientid => ?CLIENT_ID,
+            zone => internal
+        },
+        conninfo => #{
+            clientid => ?CLIENT_ID,
+            receive_maximum => 32,
+            expiry_interval => ExpiryInterval
+        }
+    }.
+
+%% A `discard' call is answered with `ok'.
+t_discard(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    _ = emqtt_connect(?CLIENT_ID, false),
+    Opts = evict_session_opts(?CLIENT_ID),
+
+    {ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts),
+
+    ?assertEqual(
+        ok,
+        emqx_eviction_agent_channel:call(Pid, discard)
+    ).
+
+%% The channel can be stopped through its own stop/1.
+t_stop(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    _ = emqtt_connect(?CLIENT_ID, false),
+    Opts = evict_session_opts(?CLIENT_ID),
+
+    {ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts),
+
+    ?assertEqual(
+        ok,
+        emqx_eviction_agent_channel:stop(Pid)
+    ).
+
+%% Unknown casts and messages are tolerated; the remaining calls get
+%% their fixed replies.
+t_ignored_calls(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    _ = emqtt_connect(?CLIENT_ID, false),
+    Opts = evict_session_opts(?CLIENT_ID),
+
+    {ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts),
+
+    ok = emqx_eviction_agent_channel:cast(Pid, unknown),
+    Pid ! unknown,
+
+    ?assertEqual(
+        [],
+        emqx_eviction_agent_channel:call(Pid, list_acl_cache)
+    ),
+
+    ?assertEqual(
+        ok,
+        emqx_eviction_agent_channel:call(Pid, {quota, quota})
+    ),
+
+    ?assertEqual(
+        ignored,
+        emqx_eviction_agent_channel:call(Pid, unknown)
+    ).
+
+%% A channel started with an expiry interval of 1 must terminate within
+%% 1.5 seconds.
+%% The process is monitored instead of sleeping for a fixed time, so the
+%% test finishes as soon as the channel goes down.
+t_expire(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    _ = emqtt_connect(?CLIENT_ID, false),
+    #{conninfo := ConnInfo} = Opts0 = evict_session_opts(?CLIENT_ID),
+    Opts1 = Opts0#{conninfo => ConnInfo#{expiry_interval => 1}},
+
+    {ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts1),
+    MRef = erlang:monitor(process, Pid),
+
+    receive
+        {'DOWN', MRef, process, Pid, _Reason} ->
+            ok
+    after 1500 ->
+        ct:fail("channel did not expire")
+    end,
+
+    ?assertNot(is_process_alive(Pid)).
+
+%% Once the evicted channel takes over the session, the client is no
+%% longer counted as connected.
+t_get_connected_client_count(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    _ = emqtt_connect(?CLIENT_ID, false),
+
+    ?assertEqual(
+        1,
+        emqx_cm:get_connected_client_count()
+    ),
+
+    Opts = evict_session_opts(?CLIENT_ID),
+
+    {ok, _} = emqx_eviction_agent_channel:start_supervised(Opts),
+
+    ?assertEqual(
+        0,
+        emqx_cm:get_connected_client_count()
+    ).
+
+%% End-to-end check that a message published while the session is held by
+%% an eviction channel is delivered after the client reconnects:
+%% subscribe, hand the session to an eviction channel, publish from another
+%% client, kick the channel, reconnect and wait up to 1000 ms for the publish.
+t_persistence(_Config) ->
+    erlang:process_flag(trap_exit, true),
+
+    Topic = <<"t1">>,
+    Message = <<"message_to_persist">>,
+
+    {ok, C0} = emqtt_connect(?CLIENT_ID, false),
+    {ok, _, _} = emqtt:subscribe(C0, Topic, 0),
+
+    Opts = evict_session_opts(?CLIENT_ID),
+    {ok, Pid} = emqx_eviction_agent_channel:start_supervised(Opts),
+
+    {ok, C1} = emqtt_connect(),
+    {ok, _} = emqtt:publish(C1, Topic, Message, 1),
+    ok = emqtt:disconnect(C1),
+
+    %% Kill channel so that the session is only persisted
+    ok = emqx_eviction_agent_channel:call(Pid, kick),
+
+    %% Should restore session from persistent storage and receive messages
+    {ok, C2} = emqtt_connect(?CLIENT_ID, false),
+
+    %% Topic and Message are already bound, so only the expected publish matches.
+    receive
+        {publish, #{
+            payload := Message,
+            topic := Topic
+        }} ->
+            ok
+    after 1000 ->
+        ct:fail("message not received")
+    end,
+
+    ok = emqtt:disconnect(C2).
+
+%%--------------------------------------------------------------------
+%% Helpers
+%%--------------------------------------------------------------------
+
+%% Builds start options for an eviction channel by keeping only the
+%% `conninfo' and `clientinfo' entries of the client's channel info.
+evict_session_opts(ClientId) ->
+    maps:with(
+        [conninfo, clientinfo],
+        emqx_cm:get_chan_info(ClientId)
+    ).
diff --git a/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_cli_SUITE.erl b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_cli_SUITE.erl
new file mode 100644
index 000000000..4cfb2fff5
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_cli_SUITE.erl
@@ -0,0 +1,39 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_cli_SUITE).
+
+-compile(export_all).
+-compile(nowarn_export_all).
+
+-include_lib("eunit/include/eunit.hrl").
+-include_lib("common_test/include/ct.hrl").
+
+%% Testcase list is derived from this module by the common test helper.
+all() ->
+    emqx_common_test_helpers:all(?MODULE).
+
+%% Starts the emqx_eviction_agent application for the whole suite.
+init_per_suite(Config) ->
+    emqx_common_test_helpers:start_apps([emqx_eviction_agent]),
+    Config.
+
+%% Disables the agent for kind `foo' (result ignored, since t_status may or
+%% may not have enabled it) and stops the application.
+end_per_suite(Config) ->
+    _ = emqx_eviction_agent:disable(foo),
+    emqx_common_test_helpers:stop_apps([emqx_eviction_agent]),
+    Config.
+
+%%--------------------------------------------------------------------
+%% Tests
+%%--------------------------------------------------------------------
+
+%% Smoke test of the CLI entry point: an unrecognised command and `status'
+%% must both return `ok', the latter before and after the agent is enabled.
+t_status(_Config) ->
+    %% usage
+    ok = emqx_eviction_agent_cli:cli(["foobar"]),
+
+    %% status
+    ok = emqx_eviction_agent_cli:cli(["status"]),
+
+    ok = emqx_eviction_agent:enable(foo, undefined),
+
+    %% status
+    ok = emqx_eviction_agent_cli:cli(["status"]).
diff --git a/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl
new file mode 100644
index 000000000..8f88ebf97
--- /dev/null
+++ b/lib-ee/emqx_eviction_agent/test/emqx_eviction_agent_test_helpers.erl
@@ -0,0 +1,141 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_eviction_agent_test_helpers).
+
+-export([
+    emqtt_connect/0,
+    emqtt_connect/1,
+    emqtt_connect/2,
+    emqtt_connect_many/2,
+    stop_many/1,
+
+    emqtt_try_connect/1,
+
+    start_cluster/2,
+    start_cluster/3,
+    stop_cluster/2,
+
+    case_specific_node_name/2,
+    case_specific_node_name/3,
+    concat_atoms/1
+]).
+
+%% Connects with client id <<"client1">> and clean_start = true.
+emqtt_connect() ->
+    emqtt_connect(<<"client1">>, true).
+
+%% Connects with the given client id and clean_start flag.
+emqtt_connect(ClientId, CleanStart) ->
+    emqtt_connect([{clientid, ClientId}, {clean_start, CleanStart}]).
+
+%% Starts an emqtt client from `Opts' with protocol v5 and a
+%% 'Session-Expiry-Interval' property of 600 appended, then connects it.
+%% Returns `{ok, Client}' or the `{error, _}' tuple from emqtt:connect/1.
+%% The start_link result is matched assertively, so a failed start crashes the caller.
+emqtt_connect(Opts) ->
+    {ok, C} = emqtt:start_link(
+        Opts ++
+            [
+                {proto_ver, v5},
+                {properties, #{'Session-Expiry-Interval' => 600}}
+            ]
+    ),
+    case emqtt:connect(C) of
+        {ok, _} -> {ok, C};
+        {error, _} = Error -> Error
+    end.
+
+%% Connects `Count' clients named <<"client-1">> .. <<"client-Count">> to
+%% `Port' with clean_start = false and returns the client pids in order.
+%% Any failed connection crashes the caller (assertive match).
+emqtt_connect_many(Port, Count) ->
+    lists:map(
+        fun(N) ->
+            NBin = integer_to_binary(N),
+            ClientId = <<"client-", NBin/binary>>,
+            {ok, C} = emqtt_connect([{clientid, ClientId}, {clean_start, false}, {port, Port}]),
+            C
+        end,
+        lists:seq(1, Count)
+    ).
+
+%% Best-effort disconnect of every client, followed by a 100 ms pause.
+stop_many(Clients) ->
+    lists:foreach(
+        fun(C) ->
+            %% Deliberately tolerant: a client that is already gone must not fail the test.
+            catch emqtt:disconnect(C)
+        end,
+        Clients
+    ),
+    ct:sleep(100).
+
+%% Tries to connect with `Opts' and disconnects again straight away.
+%% Returns `ok' when the connection succeeded, otherwise the `{error, _}' tuple.
+emqtt_try_connect(Opts) ->
+    case emqtt_connect(Opts) of
+        {ok, C} ->
+            emqtt:disconnect(C),
+            ok;
+        {error, _} = Error ->
+            Error
+    end.
+
+%% Same as start_cluster/3 with no extra application environment.
+start_cluster(NamesWithPorts, Apps) ->
+    start_cluster(NamesWithPorts, Apps, []).
+
+%% Starts one core node per `{ShortName, TcpPort}' pair, running `Apps', and
+%% forwards each node's snabbkaffe trace to the caller.
+%% `Env' is a list of extra application environment entries.
+%% Returns `[{Node, TcpPort}]'.
+start_cluster(NamesWithPorts, Apps, Env) ->
+    Specs = lists:map(
+        fun({ShortName, Port}) ->
+            {core, ShortName, #{listener_ports => [{tcp, Port}]}}
+        end,
+        NamesWithPorts
+    ),
+    Opts0 = [
+        %% A single `env' entry: with two entries under the same key, a
+        %% proplist lookup or a proplist-to-map conversion keeps only one of
+        %% them, so either boot_modules or the caller's Env would be dropped.
+        {env, [{emqx, boot_modules, [broker, listeners]}] ++ Env},
+        {apps, Apps},
+        {conf,
+            [{[listeners, Proto, default, enabled], false} || Proto <- [ssl, ws, wss]] ++
+                [{[rpc, mode], async}]}
+    ],
+    Cluster = emqx_common_test_helpers:emqx_cluster(
+        Specs,
+        Opts0
+    ),
+    NodesWithPorts = [
+        {
+            emqx_common_test_helpers:start_slave(Name, Opts),
+            proplists:get_value(Name, NamesWithPorts)
+        }
+     || {Name, Opts} <- Cluster
+    ],
+    ok = lists:foreach(
+        fun({Node, _Port}) ->
+            snabbkaffe:forward_trace(Node)
+        end,
+        NodesWithPorts
+    ),
+    NodesWithPorts.
+
+%% Stops `Apps' on every node of the cluster and then stops the node itself.
+%% The results of the remote calls are not checked.
+stop_cluster(NodesWithPorts, Apps) ->
+    lists:foreach(
+        fun({Node, _Port}) ->
+            lists:foreach(
+                fun(App) ->
+                    rpc:call(Node, application, stop, [App])
+                end,
+                Apps
+            ),
+            %% This sleep is just to make logs cleaner
+            ct:sleep(100),
+            _ = rpc:call(Node, emqx_common_test_helpers, stop_apps, []),
+            emqx_common_test_helpers:stop_slave(Node)
+        end,
+        NodesWithPorts
+    ).
+
+%% Node name atom of the form Module__Case.
+case_specific_node_name(Module, Case) ->
+    concat_atoms([Module, '__', Case]).
+
+%% Node name atom of the form Module__Case__Node.
+case_specific_node_name(Module, Case, Node) ->
+    concat_atoms([Module, '__', Case, '__', Node]).
+ +concat_atoms(Atoms) -> + binary_to_atom( + iolist_to_binary( + lists:map( + fun atom_to_binary/1, + Atoms + ) + ) + ). diff --git a/lib-ee/emqx_node_rebalance/README.md b/lib-ee/emqx_node_rebalance/README.md new file mode 100644 index 000000000..2e56f62cd --- /dev/null +++ b/lib-ee/emqx_node_rebalance/README.md @@ -0,0 +1,9 @@ +emqx_node_rebalance +===== + +An OTP library + +Build +----- + + $ rebar3 compile diff --git a/lib-ee/emqx_node_rebalance/etc/emqx_node_rebalance.conf b/lib-ee/emqx_node_rebalance/etc/emqx_node_rebalance.conf new file mode 100644 index 000000000..8ace22435 --- /dev/null +++ b/lib-ee/emqx_node_rebalance/etc/emqx_node_rebalance.conf @@ -0,0 +1,3 @@ +##-------------------------------------------------------------------- +## EMQX Node Rebalance Plugin +##-------------------------------------------------------------------- diff --git a/lib-ee/emqx_node_rebalance/i18n/emqx_node_rebalance_api_i18n.conf b/lib-ee/emqx_node_rebalance/i18n/emqx_node_rebalance_api_i18n.conf new file mode 100644 index 000000000..f5f161a92 --- /dev/null +++ b/lib-ee/emqx_node_rebalance/i18n/emqx_node_rebalance_api_i18n.conf @@ -0,0 +1,490 @@ +emqx_node_rebalance_api { + + ## API Request Fields + + load_rebalance_status { + desc { + en: "Get rebalance status of the current node" + zh: "获取当前节点的rebalance状态" + } + label { + en: "Get rebalance status" + zh: "获取rebalance状态" + } + } + + load_rebalance_global_status { + desc { + en: "Get status of all rebalance/evacuation processes across the cluster" + zh: "获取集群中所有rebalance/evacuation进程的状态" + } + label { + en: "Get global rebalance status" + zh: "获取全局rebalance状态" + } + } + + load_rebalance_availability_check { + desc { + en: "Check if the node is being evacuated or rebalanced" + zh: "检查节点是否正在被evacuate或rebalance" + } + label { + en: "Availability check" + zh: "可用性检查" + } + } + + load_rebalance_start { + desc { + en: "Start rebalance process" + zh: "启动rebalance进程" + } + label { + en: "Start rebalance" + zh: "启动rebalance" + } + 
} + + load_rebalance_stop { + desc { + en: "Stop rebalance process" + zh: "停止rebalance进程" + } + label { + en: "Stop rebalance" + zh: "停止rebalance" + } + } + + load_rebalance_evacuation_start { + desc { + en: "Start evacuation process" + zh: "启动evacuation进程" + } + label { + en: "Start evacuation" + zh: "启动evacuation" + } + } + + load_rebalance_evacuation_stop { + desc { + en: "Stop evacuation process" + zh: "停止evacuation进程" + } + label { + en: "Stop evacuation" + zh: "停止evacuation" + } + } + + param_node { + desc { + en: "Node name" + zh: "节点名称" + } + label { + en: "Node name" + zh: "节点名称" + } + } + + wait_health_check { + desc { + en: "Time to wait before starting the rebalance process, in seconds" + zh: "启动rebalance进程前等待的时间,单位为秒" + } + label { + en: "Wait health check" + zh: "等待健康检查" + } + } + + conn_evict_rate { + desc { + en: "The rate of evicting connections, in connections per second" + zh: "逐出连接的速率,以每秒连接数表示" + } + label { + en: "Connection eviction rate" + zh: "连接驱逐率" + } + } + + sess_evict_rate { + desc { + en: "The rate of evicting sessions, in sessions per second" + zh: "逐出会话的速率,以每秒会话为单位" + } + label { + en: "Session eviction rate" + zh: "会话驱逐率" + } + } + + abs_conn_threshold { + desc { + en: "Maximum desired difference between the number of connections on the node and the average number of connections on the recipient nodes" + zh: "节点上的连接数与接收节点上的平均连接数之间的最大期望差值" + } + label { + en: "Absolute connection threshold" + zh: "绝对连接阈值" + } + } + + rel_conn_threshold { + desc { + en: "Maximum desired fraction between the number of connections on the node and the average number of connections on the recipient nodes" + zh: "节点上的连接数与接收节点上的平均连接数之间的最大期望分数" + } + label { + en: "Relative connection threshold" + zh: "相对连接阈值" + } + } + + abs_sess_threshold { + desc { + en: "Maximum desired difference between the number of sessions on the node and the average number of sessions on the recipient nodes" + zh: "节点上的会话数与接收节点上的平均会话数之间的最大期望差异" + } + label { + en: "Absolute session 
threshold" + zh: "绝对会话阈值" + } + } + + rel_sess_threshold { + desc { + en: "Maximum desired fraction between the number of sessions on the node and the average number of sessions on the recipient nodes" + zh: "节点上的会话数与接收节点上的平均会话数之间的最大期望分数" + } + label { + en: "Relative session threshold" + zh: "相对会话阈值" + } + } + + wait_takeover { + desc { + en: "Time to wait before starting session evacuation process, in seconds" + zh: "开始会话疏散过程之前等待的时间,以秒为单位" + } + label { + en: "Wait takeover" + zh: "等待接管" + } + } + + redirect_to { + desc { + en: "Server reference to redirect clients to (MQTTv5 Server redirection)" + zh: "将客户端重定向到的服务器参考(MQTTv5 服务器重定向)" + } + label { + en: "Redirect to" + zh: "重定向至" + } + } + + migrate_to { + desc { + en: "Nodes to migrate sessions to" + zh: "将会话迁移到的节点" + } + label { + en: "Migrate to" + zh: "迁移到" + } + } + + rebalance_nodes { + desc { + en: "Nodes to participate in rebalance" + zh: "参与rebalance的节点" + } + label { + en: "Rebalance nodes" + zh: "重新平衡节点" + } + } + + ## API Response Fields + + local_status_enabled { + desc { + en: "Whether the node is being evacuated" + zh: "节点是否正在撤离" + } + label { + en: "Local evacuation status" + zh: "当地避难状况" + } + } + + local_status_process { + desc { + en: "The process that is being performed on the node: evacuation or rebalance" + zh: "正在节点上执行的过程:疏散或重新平衡" + } + label { + en: "Node process" + zh: "节点进程" + } + } + + local_status_state { + desc { + en: "The state of the process that is being performed on the node" + zh: "正在节点上执行的进程的状态" + } + label { + en: "Rebalance/evacuation current state" + zh: "重新平衡/疏散当前状态" + } + } + + local_status_coordinator_node { + desc { + en: "The node that is coordinating rebalance process" + zh: "协调再平衡过程的节点" + } + label { + en: "Coordinator node" + zh: "协调节点" + } + } + + local_status_connection_eviction_rate { + desc { + en: "The rate of evicting connections, in connections per second" + zh: "逐出连接的速率,以每秒连接数表示" + } + label { + en: "Connection eviction rate" + zh: "连接驱逐率" + } + } + + 
local_status_session_eviction_rate { + desc { + en: "The rate of evicting sessions, in sessions per second" + zh: "逐出会话的速率,以每秒会话为单位" + } + label { + en: "Session eviction rate" + zh: "会话驱逐率" + } + } + + local_status_connection_goal { + desc { + en: "The number of connections that the node should have after the rebalance/evacuation process" + zh: "节点在重新平衡/疏散过程后应该拥有的连接数" + } + label { + en: "Connection goal" + zh: "连接目标" + } + } + + local_status_session_goal { + desc { + en: "The number of sessions that the node should have after the evacuation process" + zh: "疏散过程后节点应有的会话数" + } + label { + en: "Session goal" + zh: "会话目标" + } + } + + local_status_disconnected_session_goal { + desc { + en: "The number of disconnected sessions that the node should have after the rebalance process" + zh: "重新平衡过程后节点应具有的断开连接的会话数" + } + label { + en: "Disconnected session goal" + zh: "断开连接的会话目标" + } + } + + local_status_session_recipients { + desc { + en: "List of nodes to which sessions are being evacuated" + zh: "会话被疏散到的节点列表" + } + label { + en: "Session recipients" + zh: "会话收件人" + } + } + + local_status_recipients { + desc { + en: "List of nodes to which connections/sessions are being evacuated during rebalance" + zh: "在重新平衡期间连接/会话被疏散到的节点列表" + } + label { + en: "Recipients" + zh: "收件人" + } + } + + local_status_stats { + desc { + en: "Statistics of the evacuation/rebalance process" + zh: "疏散/再平衡过程的统计" + } + label { + en: "Statistics" + zh: "统计数据" + } + } + + status_stats_initial_connected { + desc { + en: "The number of connections on the node before the evacuation/rebalance process" + zh: "疏散/重新平衡过程之前节点上的连接数" + } + label { + en: "Initial connected" + zh: "初始连接" + } + } + + status_stats_current_connected { + desc { + en: "Current number of connections on the node" + zh: "节点上的当前连接数" + } + label { + en: "Current connections" + zh: "当前连接" + } + } + + status_stats_initial_sessions { + desc { + en: "The number of sessions on the node before the evacuation/rebalance process" + zh: 
"疏散/重新平衡过程之前节点上的会话数" + } + label { + en: "Initial sessions" + zh: "初始会话" + } + } + + status_stats_current_sessions { + desc { + en: "Current number of sessions on the node" + zh: "节点上的当前会话数" + } + label { + en: "Current sessions" + zh: "当前会话" + } + } + + status_stats_current_disconnected_sessions { + desc { + en: "Current number of disconnected sessions on the node" + zh: "节点上当前断开连接的会话数" + } + label { + en: "Current disconnected sessions" + zh: "当前断开连接的会话" + } + } + + coordinator_status_donors { + desc { + en: "List of nodes from which connections/sessions are being evacuated" + zh: "正在疏散连接/会话的节点列表" + } + label { + en: "Donors" + zh: "捐助者" + } + } + + coordinator_status_donor_conn_avg { + desc { + en: "Average number of connections per donor node" + zh: "每个供体节点的平均连接数" + } + label { + en: "Donor connections average" + zh: "捐助者连接平均值" + } + } + + coordinator_status_donor_sess_avg { + desc { + en: "Average number of sessions per donor node" + zh: "每个供体节点的平均会话数" + } + label { + en: "Donor sessions average" + zh: "平均捐助会议" + } + } + + coordinator_status_node { + desc { + en: "The node that is coordinating the evacuation/rebalance process" + zh: "协调疏散/再平衡过程的节点" + } + label { + en: "Coordinator node" + zh: "协调节点" + } + } + + evacuation_status_node { + desc { + en: "The node that is being evacuated" + zh: "正在撤离的节点" + } + label { + en: "Evacuated node" + zh: "疏散节点" + } + } + + global_status_evacuations { + desc { + en: "List of nodes that are being evacuated" + zh: "正在撤离的节点列表" + } + label { + en: "Evacuations" + zh: "疏散" + } + } + + global_status_rebalances { + desc { + en: "List of nodes that coordinate a rebalance" + zh: "协调再平衡的节点列表" + } + label { + en: "Rebalances" + zh: "再平衡" + } + } + + empty_response { + desc { + en: "The response is empty" + zh: "响应为空" + } + label { + en: "Empty response" + zh: "空响应" + } + } +} diff --git a/lib-ee/emqx_node_rebalance/include/emqx_node_rebalance.hrl b/lib-ee/emqx_node_rebalance/include/emqx_node_rebalance.hrl new file mode 100644 index 
000000000..ccc671e81 --- /dev/null +++ b/lib-ee/emqx_node_rebalance/include/emqx_node_rebalance.hrl @@ -0,0 +1,33 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%%-------------------------------------------------------------------- + +-define(DEFAULT_CONN_EVICT_RATE, 500). +-define(DEFAULT_SESS_EVICT_RATE, 500). + +%% sec +-define(DEFAULT_WAIT_HEALTH_CHECK, 60). +%% sec +-define(DEFAULT_WAIT_TAKEOVER, 60). + +-define(DEFAULT_ABS_CONN_THRESHOLD, 1000). +-define(DEFAULT_ABS_SESS_THRESHOLD, 1000). + +-define(DEFAULT_REL_CONN_THRESHOLD, 1.1). +-define(DEFAULT_REL_SESS_THRESHOLD, 1.1). + +-define(EVICT_INTERVAL, 1000). + +-define(EVACUATION_FILENAME, <<".evacuation">>). diff --git a/lib-ee/emqx_node_rebalance/rebar.config b/lib-ee/emqx_node_rebalance/rebar.config new file mode 100644 index 000000000..b055d8f4f --- /dev/null +++ b/lib-ee/emqx_node_rebalance/rebar.config @@ -0,0 +1,2 @@ +{deps, [{emqx, {path, "../../apps/emqx"}}]}. +{project_plugins, [erlfmt]}. 
diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.app.src b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.app.src new file mode 100644 index 000000000..9673e4fda --- /dev/null +++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.app.src @@ -0,0 +1,22 @@ +{application, emqx_node_rebalance, [ + {description, "EMQX Node Rebalance"}, + {vsn, "5.0.0"}, + {registered, [ + emqx_node_rebalance_sup, + emqx_node_rebalance, + emqx_node_rebalance_agent, + emqx_node_rebalance_evacuation + ]}, + {applications, [ + kernel, + stdlib + ]}, + {mod, {emqx_node_rebalance_app, []}}, + {env, []}, + {modules, []}, + {maintainers, ["EMQX Team "]}, + {links, [ + {"Homepage", "https://emqx.io/"}, + {"Github", "https://github.com/emqx"} + ]} +]}. diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.appup.src b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.appup.src new file mode 100644 index 000000000..c1b84778d --- /dev/null +++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.appup.src @@ -0,0 +1,3 @@ +%% -*- mode: erlang -*- +%% Unless you know what you are doing, DO NOT edit manually!! +{VSN, [{<<".*">>, []}], [{<<".*">>, []}]}. diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.erl new file mode 100644 index 000000000..1f2adc565 --- /dev/null +++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance.erl @@ -0,0 +1,438 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance). + +-include("emqx_node_rebalance.hrl"). + +-include_lib("emqx/include/logger.hrl"). +-include_lib("emqx/include/types.hrl"). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). + +-export([ + start/1, + status/0, + status/1, + stop/0 +]). + +-export([start_link/0]). + +-behaviour(gen_statem). 
+
+-export([
+    init/1,
+    callback_mode/0,
+    handle_event/4,
+    code_change/4
+]).
+
+-export([
+    is_node_available/0,
+    available_nodes/1,
+    connection_count/0,
+    session_count/0,
+    disconnected_session_count/0
+]).
+
+-export_type([
+    start_opts/0,
+    start_error/0
+]).
+
+%%--------------------------------------------------------------------
+%% APIs
+%%--------------------------------------------------------------------
+
+%% Every key is optional; start/1 fills the missing ones from default_opts/0.
+-type start_opts() :: #{
+    conn_evict_rate => pos_integer(),
+    sess_evict_rate => pos_integer(),
+    wait_health_check => pos_integer(),
+    wait_takeover => pos_integer(),
+    abs_conn_threshold => pos_integer(),
+    rel_conn_threshold => number(),
+    abs_sess_threshold => pos_integer(),
+    rel_sess_threshold => number(),
+    nodes => [node()]
+}.
+%% `nothing_to_balance' is what enable_rebalance/1 reports when the node set
+%% needs no rebalancing; the coordinator relays it unchanged to the caller.
+-type start_error() :: already_started | nothing_to_balance | [{node(), term()}].
+
+%% Asks the locally registered coordinator to start a rebalance.
+%% `StartOpts' takes precedence over the defaults.
+-spec start(start_opts()) -> ok_or_error(start_error()).
+start(StartOpts) ->
+    Opts = maps:merge(default_opts(), StartOpts),
+    gen_statem:call(?MODULE, {start, Opts}).
+
+%% Stops the running rebalance; `{error, not_started}' when none is running.
+-spec stop() -> ok_or_error(not_started).
+stop() ->
+    gen_statem:call(?MODULE, stop).
+
+%% Status of the locally registered coordinator.
+-spec status() -> disabled | {enabled, map()}.
+status() ->
+    gen_statem:call(?MODULE, status).
+
+%% Status of the coordinator process `Pid'.
+-spec status(pid()) -> disabled | {enabled, map()}.
+status(Pid) ->
+    gen_statem:call(Pid, status).
+
+%% Starts the coordinator and registers it locally under the module name.
+-spec start_link() -> startlink_ret().
+start_link() ->
+    gen_statem:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+%% Queries `Nodes' through the proto module and keeps only the atom results.
+%% Presumably each healthy node answers with its own node name (see
+%% is_node_available/0) while failures come back as non-atom terms.
+-spec available_nodes(list(node())) -> list(node()).
+available_nodes(Nodes) when is_list(Nodes) ->
+    {Available, _} = emqx_node_rebalance_proto_v1:available_nodes(Nodes),
+    lists:filter(fun is_atom/1, Available).
+
+%%--------------------------------------------------------------------
+%% gen_statem callbacks
+%%--------------------------------------------------------------------
+
+%% All events are dispatched through handle_event/4.
+callback_mode() -> handle_event_function.
+
+%% states: disabled, wait_health_check, evicting_conns, wait_takeover, evicting_sessions
+
+%% The coordinator starts in `disabled' with empty state data.
+init([]) ->
+    ?tp(debug, emqx_node_rebalance_started, #{}),
+    {ok, disabled, #{}}.
+
+%% Single event handler for the coordinator state machine.
+%% A run goes disabled -> wait_health_check -> evicting_conns ->
+%% wait_takeover -> evicting_sessions -> disabled, driven by state timeouts.
+%% Clause order matters: the specific clauses must stay ahead of the
+%% catch-all call/info/cast clauses at the bottom.
+%% start
+handle_event(
+    {call, From},
+    {start, #{wait_health_check := WaitHealthCheck} = Opts},
+    disabled,
+    #{} = Data
+) ->
+    case enable_rebalance(Data#{opts => Opts}) of
+        {ok, NewData} ->
+            ?SLOG(warning, #{msg => "node_rebalance_enabled", opts => Opts}),
+            %% Connection eviction begins only after the health-check wait.
+            {next_state, wait_health_check, NewData, [
+                {state_timeout, seconds(WaitHealthCheck), evict_conns},
+                {reply, From, ok}
+            ]};
+        {error, Reason} ->
+            ?SLOG(warning, #{
+                msg => "node_rebalance_enable_failed",
+                reason => Reason
+            }),
+            {keep_state_and_data, [{reply, From, {error, Reason}}]}
+    end;
+%% Any start request that did not match the clause above is answered with already_started.
+handle_event({call, From}, {start, _Opts}, _State, #{}) ->
+    {keep_state_and_data, [{reply, From, {error, already_started}}]};
+%% stop
+handle_event({call, From}, stop, disabled, #{}) ->
+    {keep_state_and_data, [{reply, From, {error, not_started}}]};
+handle_event({call, From}, stop, _State, Data) ->
+    ok = disable_rebalance(Data),
+    ?SLOG(warning, #{msg => "node_rebalance_stopped"}),
+    {next_state, disabled, deinit(Data), [{reply, From, ok}]};
+%% status
+handle_event({call, From}, status, disabled, #{}) ->
+    {keep_state_and_data, [{reply, From, disabled}]};
+handle_event({call, From}, status, State, Data) ->
+    %% The reply is the state data plus the current state name and this node.
+    Stats = get_stats(State, Data),
+    {keep_state_and_data, [
+        {reply, From,
+            {enabled, Stats#{
+                state => State,
+                coordinator_node => node()
+            }}}
+    ]};
+%% conn eviction
+handle_event(
+    state_timeout,
+    evict_conns,
+    wait_health_check,
+    Data
+) ->
+    ?SLOG(warning, #{msg => "node_rebalance_wait_health_check_over"}),
+    %% A zero timeout triggers the first eviction round immediately.
+    {next_state, evicting_conns, Data, [{state_timeout, 0, evict_conns}]};
+handle_event(
+    state_timeout,
+    evict_conns,
+    evicting_conns,
+    #{
+        opts := #{
+            wait_takeover := WaitTakeover,
+            evict_interval := EvictInterval
+        }
+    } = Data
+) ->
+    case evict_conns(Data) of
+        ok ->
+            %% Within thresholds: move on with the unchanged Data.
+            ?SLOG(warning, #{msg => "node_rebalance_evict_conns_over"}),
+            {next_state, wait_takeover, Data, [
+                {state_timeout, seconds(WaitTakeover), evict_sessions}
+            ]};
+        {continue, NewData} ->
+            %% Another round is scheduled after EvictInterval (passed as-is to state_timeout).
+            {keep_state, NewData, [{state_timeout, EvictInterval, evict_conns}]}
+    end;
+%% session eviction
+handle_event(
+    state_timeout,
+    evict_sessions,
+    wait_takeover,
+    Data
+) ->
+    ?SLOG(warning, #{msg => "node_rebalance_wait_takeover_over"}),
+    {next_state, evicting_sessions, Data, [{state_timeout, 0, evict_sessions}]};
+handle_event(
+    state_timeout,
+    evict_sessions,
+    evicting_sessions,
+    #{opts := #{evict_interval := EvictInterval}} = Data
+) ->
+    case evict_sessions(Data) of
+        ok ->
+            %% Rebalance finished: release the donor agents and go back to disabled.
+            ?tp(debug, emqx_node_rebalance_evict_sess_over, #{}),
+            ?SLOG(warning, #{msg => "node_rebalance_evict_sessions_over"}),
+            ok = disable_rebalance(Data),
+            ?SLOG(warning, #{msg => "node_rebalance_finished_successfully"}),
+            {next_state, disabled, deinit(Data)};
+        {continue, NewData} ->
+            {keep_state, NewData, [{state_timeout, EvictInterval, evict_sessions}]}
+    end;
+%% Anything else is logged; unknown calls are answered with `ignored'.
+handle_event({call, From}, Msg, _State, _Data) ->
+    ?SLOG(warning, #{msg => "node_rebalance_unknown_call", call => Msg}),
+    {keep_state_and_data, [{reply, From, ignored}]};
+handle_event(info, Msg, _State, _Data) ->
+    ?SLOG(warning, #{msg => "node_rebalance_unknown_info", info => Msg}),
+    keep_state_and_data;
+handle_event(cast, Msg, _State, _Data) ->
+    ?SLOG(warning, #{msg => "node_rebalance_unknown_cast", cast => Msg}),
+    keep_state_and_data.
+
+%% State and data are carried over unchanged on code upgrade.
+code_change(_Vsn, State, Data, _Extra) ->
+    {ok, State, Data}.
+
+%%--------------------------------------------------------------------
+%% internal funs
+%%--------------------------------------------------------------------
+
+%% Splits the configured nodes into donors (connection count at or above
+%% the average) and recipients (below it). When a rebalance is needed, the
+%% rebalance agent is enabled on every donor and the initial counts are
+%% recorded in the state data; otherwise `{error, nothing_to_balance}'.
+enable_rebalance(#{opts := Opts} = Data) ->
+    Nodes = maps:get(nodes, Opts),
+    ConnCounts = multicall(Nodes, connection_counts, []),
+    SessCounts = multicall(Nodes, session_counts, []),
+    Avg = avg([Count || {_Node, Count} <- ConnCounts]),
+    IsDonor = fun({_Node, Count}) -> Count >= Avg end,
+    DonorCounts = [NodeCount || NodeCount <- ConnCounts, IsDonor(NodeCount)],
+    RecipientCounts = [NodeCount || NodeCount <- ConnCounts, not IsDonor(NodeCount)],
+    ?SLOG(warning, #{
+        msg => "node_rebalance_enabling",
+        conn_counts => ConnCounts,
+        donor_counts => DonorCounts,
+        recipient_counts => RecipientCounts
+    }),
+    DonorNodes = [Node || {Node, _Count} <- DonorCounts],
+    RecipientNodes = [Node || {Node, _Count} <- RecipientCounts],
+    case need_rebalance(DonorNodes, RecipientNodes, ConnCounts, SessCounts, Opts) of
+        true ->
+            %% multicall/3 raises on any failure, so the result carries no information here.
+            _ = multicall(DonorNodes, enable_rebalance_agent, [self()]),
+            {ok, Data#{
+                donors => DonorNodes,
+                recipients => RecipientNodes,
+                initial_conn_counts => maps:from_list(ConnCounts),
+                initial_sess_counts => maps:from_list(SessCounts)
+            }};
+        false ->
+            {error, nothing_to_balance}
+    end.
+
+%% Disables the rebalance agent on every donor node on behalf of this process.
+disable_rebalance(#{donors := Donors}) ->
+    _ = multicall(Donors, disable_rebalance_agent, [self()]),
+    ok.
+
+%% One connection-eviction round.
+%% Returns `ok' when the donors' average connection count is within the
+%% configured thresholds of the recipients' average. Otherwise asks every
+%% donor above the recipient average to evict `conn_evict_rate' connections
+%% and returns `{continue, NewData}' with the freshly sampled figures.
+evict_conns(#{donors := Donors, recipients := Recipients, opts := Opts} = Data) ->
+    DonorStats = multicall(Donors, connection_counts, []),
+    RecipientStats = multicall(Recipients, connection_counts, []),
+
+    DonorAvg = avg([Count || {_Node, Count} <- DonorStats]),
+    RecipientAvg = avg([Count || {_Node, Count} <- RecipientStats]),
+    case within_thresholds(DonorAvg, RecipientAvg, thresholds(conn, Opts)) of
+        true ->
+            ok;
+        false ->
+            Rate = maps:get(conn_evict_rate, Opts),
+            Targets = nodes_to_evict(RecipientAvg, DonorStats),
+            ?SLOG(warning, #{
+                msg => "node_rebalance_evict_conns",
+                nodes => Targets,
+                counts => Rate
+            }),
+            _ = multicall(Targets, evict_connections, [Rate]),
+            {continue, Data#{
+                donor_conn_avg => DonorAvg,
+                recipient_conn_avg => RecipientAvg,
+                donor_conn_counts => maps:from_list(DonorStats),
+                recipient_conn_counts => maps:from_list(RecipientStats)
+            }}
+    end.
+
+%% One session-eviction round, based on disconnected session counts.
+%% Returns `ok' when the donors' average is within the configured session
+%% thresholds of the recipients' average. Otherwise asks every donor above
+%% the recipient average to evict `sess_evict_rate' disconnected sessions
+%% towards the recipient nodes and returns `{continue, NewData}'.
+evict_sessions(#{donors := Donors, recipients := Recipients, opts := Opts} = Data) ->
+    DonorStats = multicall(Donors, disconnected_session_counts, []),
+    RecipientStats = multicall(Recipients, disconnected_session_counts, []),
+
+    DonorAvg = avg([Count || {_Node, Count} <- DonorStats]),
+    RecipientAvg = avg([Count || {_Node, Count} <- RecipientStats]),
+    case within_thresholds(DonorAvg, RecipientAvg, thresholds(sess, Opts)) of
+        true ->
+            ok;
+        false ->
+            Rate = maps:get(sess_evict_rate, Opts),
+            Targets = nodes_to_evict(RecipientAvg, DonorStats),
+            ?SLOG(warning, #{
+                msg => "node_rebalance_evict_sessions",
+                nodes => Targets,
+                counts => Rate
+            }),
+            _ = multicall(
+                Targets,
+                evict_sessions,
+                [Rate, Recipients, disconnected]
+            ),
+            {continue, Data#{
+                donor_sess_avg => DonorAvg,
+                recipient_sess_avg => RecipientAvg,
+                donor_sess_counts => maps:from_list(DonorStats),
+                recipient_sess_counts => maps:from_list(RecipientStats)
+            }}
+    end.
+
+%% Decides whether a rebalance is worth starting.
+%% With no donors or no recipients there is nothing to move. Otherwise a
+%% rebalance is needed when the donors' average is outside the thresholds of
+%% the recipients' average for connections or for sessions.
+%% `ConnCounts' and `SessCounts' are `[{Node, Count}]' lists.
+need_rebalance([] = _DonorNodes, _RecipientNodes, _ConnCounts, _SessCounts, _Opts) ->
+    false;
+need_rebalance(_DonorNodes, [] = _RecipientNodes, _ConnCounts, _SessCounts, _Opts) ->
+    false;
+need_rebalance(DonorNodes, RecipientNodes, ConnCounts, SessCounts, Opts) ->
+    DonorConnAvg = avg_for_nodes(DonorNodes, ConnCounts),
+    RecipientConnAvg = avg_for_nodes(RecipientNodes, ConnCounts),
+    DonorSessAvg = avg_for_nodes(DonorNodes, SessCounts),
+    RecipientSessAvg = avg_for_nodes(RecipientNodes, SessCounts),
+    Result =
+        (not within_thresholds(DonorConnAvg, RecipientConnAvg, thresholds(conn, Opts))) orelse
+            (not within_thresholds(DonorSessAvg, RecipientSessAvg, thresholds(sess, Opts))),
+    ?tp(
+        debug,
+        emqx_node_rebalance_need_rebalance,
+        #{
+            donors => DonorNodes,
+            recipients => RecipientNodes,
+            conn_counts => ConnCounts,
+            sess_counts => SessCounts,
+            opts => Opts,
+            result => Result
+        }
+    ),
+    Result.
+
+%% Average of the counts belonging to `Nodes' in a `[{Node, Count}]' list.
+%% Crashes (via avg/1) when none of `Nodes' appears in `Counts'.
+avg_for_nodes(Nodes, Counts) ->
+    avg(maps:values(maps:with(Nodes, maps:from_list(Counts)))).
+
+%% True when `Value' exceeds `GoalValue' by at most `AbsThres', or is at
+%% most `GoalValue * RelThres'. Satisfying either bound is enough.
+within_thresholds(Value, GoalValue, {AbsThres, RelThres}) ->
+    (Value =< GoalValue + AbsThres) orelse (Value =< GoalValue * RelThres).
+
+%% `{Absolute, Relative}' threshold pair for connections or sessions.
+thresholds(conn, #{abs_conn_threshold := Abs, rel_conn_threshold := Rel}) ->
+    {Abs, Rel};
+thresholds(sess, #{abs_sess_threshold := Abs, rel_sess_threshold := Rel}) ->
+    {Abs, Rel}.
+
+%% Nodes whose count is strictly above `Goal', in their original order.
+nodes_to_evict(Goal, NodeCounts) ->
+    [Node || {Node, Count} <- NodeCounts, Count > Goal].
+
+%% Statistics reported by `status': empty while disabled, the state data otherwise.
+get_stats(disabled, _Data) -> #{};
+get_stats(_State, Data) -> Data.
+
+%% Arithmetic mean of a non-empty list of numbers (always a float).
+%% Non-emptiness is checked by pattern matching rather than a `length/1'
+%% guard, so the list is not walked just to select the clause. An empty list
+%% is a caller error and raises function_clause.
+avg([_ | _] = List) ->
+    lists:sum(List) / length(List).
+
+%% Calls `emqx_node_rebalance_proto_v1:F(Nodes, A...)' and returns
+%% `[{Node, Value}]' when every node answered `ok' or `{ok, Value}'.
+%% Raises `{bad_nodes, _}' when any node is reported bad by the RPC layer
+%% or answered with anything else.
+multicall(Nodes, F, A) ->
+    case apply(emqx_node_rebalance_proto_v1, F, [Nodes | A]) of
+        {Results, []} ->
+            NodeResults = lists:zip(Nodes, Results),
+            case [NodeResult || NodeResult <- NodeResults, not is_ok(NodeResult)] of
+                [] ->
+                    [{Node, ok_result(Result)} || {Node, Result} <- NodeResults];
+                Failed ->
+                    error({bad_nodes, Failed})
+            end;
+        {_, [_BadNode | _] = BadNodes} ->
+            error({bad_nodes, BadNodes})
+    end.
+
+%% True for a `{Node, ok}' or `{Node, {ok, _}}' pair, false for anything else.
+is_ok({_Node, Result}) ->
+    case Result of
+        {ok, _} -> true;
+        ok -> true;
+        _ -> false
+    end;
+is_ok(_) ->
+    false.
+
+%% Unwraps a successful per-node result.
+ok_result(ok) -> ok;
+ok_result({ok, Result}) -> Result.
+
+%% Local figures, wrapped in `{ok, _}' so multicall/3 accepts them.
+connection_count() ->
+    Count = emqx_eviction_agent:connection_count(),
+    {ok, Count}.
+
+session_count() ->
+    Count = emqx_eviction_agent:session_count(),
+    {ok, Count}.
+
+disconnected_session_count() ->
+    Count = emqx_eviction_agent:session_count(disconnected),
+    {ok, Count}.
+
+%% Option values used for any key the caller of start/1 leaves out.
+default_opts() ->
+    #{
+        nodes => all_nodes(),
+
+        evict_interval => ?EVICT_INTERVAL,
+
+        wait_health_check => ?DEFAULT_WAIT_HEALTH_CHECK,
+        wait_takeover => ?DEFAULT_WAIT_TAKEOVER,
+
+        conn_evict_rate => ?DEFAULT_CONN_EVICT_RATE,
+        abs_conn_threshold => ?DEFAULT_ABS_CONN_THRESHOLD,
+        rel_conn_threshold => ?DEFAULT_REL_CONN_THRESHOLD,
+
+        sess_evict_rate => ?DEFAULT_SESS_EVICT_RATE,
+        abs_sess_threshold => ?DEFAULT_ABS_SESS_THRESHOLD,
+        rel_sess_threshold => ?DEFAULT_REL_SESS_THRESHOLD
+    }.
+
+%% Drops the per-run statistics and options from the state data.
+deinit(Data) ->
+    maps:without(
+        [
+            opts,
+            initial_conn_counts,
+            initial_sess_counts,
+            donor_conn_avg,
+            donor_sess_avg,
+            donor_conn_counts,
+            donor_sess_counts,
+            recipient_conn_avg,
+            recipient_sess_avg,
+            recipient_conn_counts,
+            recipient_sess_counts
+        ],
+        Data
+    ).
+
+%% Returns this node's name when the rebalance agent is running and the
+%% eviction agent is disabled; crashes with badmatch otherwise.
+is_node_available() ->
+    AgentPid = whereis(emqx_node_rebalance_agent),
+    true = is_pid(AgentPid),
+    disabled = emqx_eviction_agent:status(),
+    node().
+
+all_nodes() ->
+    mria_mnesia:running_nodes().
+
+%% Seconds to whole milliseconds.
+seconds(Sec) ->
+    Millis = timer:seconds(Sec),
+    round(Millis).
diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_agent.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_agent.erl
new file mode 100644
index 000000000..47708d00e
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_agent.erl
@@ -0,0 +1,131 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_agent).
+
+%% Declared so the compiler checks the gen_server callbacks exported below.
+-behaviour(gen_server).
+
+-include_lib("emqx/include/emqx_mqtt.hrl").
+-include_lib("emqx/include/logger.hrl").
+-include_lib("emqx/include/types.hrl").
+
+-include_lib("stdlib/include/qlc.hrl").
+-include_lib("snabbkaffe/include/snabbkaffe.hrl").
+
+-export([
+    start_link/0,
+    enable/1,
+    disable/1,
+    status/0
+]).
+
+-export([
+    init/1,
+    handle_call/3,
+    handle_info/2,
+    handle_cast/2,
+    code_change/3
+]).
+
+%% Kind under which this module enables and disables emqx_eviction_agent.
+-define(ENABLE_KIND, emqx_node_rebalance).
+
+%%--------------------------------------------------------------------
+%% APIs
+%%--------------------------------------------------------------------
+
+-type status() :: {enabled, pid()} | disabled.
+
+%% Starts the agent and registers it locally under the module name.
+-spec start_link() -> startlink_ret().
+start_link() ->
+    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+%% Enables the agent on behalf of the coordinator `CoordinatorPid'.
+-spec enable(pid()) -> ok_or_error(already_enabled | eviction_agent_busy).
+enable(CoordinatorPid) ->
+    gen_server:call(?MODULE, {enable, CoordinatorPid}).
+
+%% Disables the agent; only the coordinator that enabled it is accepted.
+-spec disable(pid()) -> ok_or_error(already_disabled | invalid_coordinator).
+disable(CoordinatorPid) ->
+    gen_server:call(?MODULE, {disable, CoordinatorPid}).
+
+%% `{enabled, CoordinatorPid}' while enabled, `disabled' otherwise.
+-spec status() -> status().
+status() ->
+    gen_server:call(?MODULE, status).
+
+%%--------------------------------------------------------------------
+%% gen_server callbacks
+%%--------------------------------------------------------------------
+
+%% The agent starts disabled: an empty state map means no coordinator.
+init([]) ->
+    {ok, #{}}.
+
+%% Synchronous requests: `{enable, Pid}', `{disable, Pid}' and `status'.
+%% Anything else is logged and answered with `ignored'.
+handle_call({enable, CoordinatorPid}, _From, St) ->
+    do_enable(CoordinatorPid, St);
+handle_call({disable, CoordinatorPid}, _From, St) ->
+    do_disable(CoordinatorPid, St);
+handle_call(status, _From, St) ->
+    {reply, current_status(St), St};
+handle_call(Msg, _From, St) ->
+    ?SLOG(warning, #{
+        msg => "unknown_call",
+        call => Msg,
+        state => St
+    }),
+    {reply, ignored, St}.
+
+handle_info(Msg, St) ->
+    ?SLOG(warning, #{
+        msg => "unknown_info",
+        info => Msg,
+        state => St
+    }),
+    {noreply, St}.
+
+handle_cast(Msg, St) ->
+    ?SLOG(warning, #{
+        msg => "unknown_cast",
+        cast => Msg,
+        state => St
+    }),
+    {noreply, St}.
+
+code_change(_Vsn, State, _Extra) ->
+    {ok, State}.
+
+%% Link to the coordinator and to the eviction agent, then enable eviction.
+%% Both links are dropped again when the eviction agent is busy.
+do_enable(_CoordinatorPid, #{coordinator_pid := _Current} = St) ->
+    {reply, {error, already_enabled}, St};
+do_enable(CoordinatorPid, St) ->
+    true = link(CoordinatorPid),
+    EvictionAgentPid = whereis(emqx_eviction_agent),
+    true = link(EvictionAgentPid),
+    case emqx_eviction_agent:enable(?ENABLE_KIND, undefined) of
+        {error, eviction_agent_busy} ->
+            true = unlink(EvictionAgentPid),
+            true = unlink(CoordinatorPid),
+            {reply, {error, eviction_agent_busy}, St};
+        ok ->
+            NewSt = #{
+                coordinator_pid => CoordinatorPid,
+                eviction_agent_pid => EvictionAgentPid
+            },
+            {reply, ok, NewSt}
+    end.
+
+%% Only the coordinator that enabled the agent may disable it.
+do_disable(
+    CoordinatorPid,
+    #{coordinator_pid := CoordinatorPid, eviction_agent_pid := EvictionAgentPid} = St
+) ->
+    _ = emqx_eviction_agent:disable(?ENABLE_KIND),
+    true = unlink(EvictionAgentPid),
+    true = unlink(CoordinatorPid),
+    {reply, ok, maps:without([coordinator_pid, eviction_agent_pid], St)};
+do_disable(_CoordinatorPid, #{coordinator_pid := _Other} = St) ->
+    {reply, {error, invalid_coordinator}, St};
+do_disable(_CoordinatorPid, #{} = St) ->
+    {reply, {error, already_disabled}, St}.
+
+current_status(#{coordinator_pid := Pid}) -> {enabled, Pid};
+current_status(_St) -> disabled.
diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_api.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_api.erl
new file mode 100644
index 000000000..fa322d146
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_api.erl
@@ -0,0 +1,738 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+%% HTTP API (minirest) for node rebalance and node evacuation under the
+%% "/load_rebalance" path prefix.
+-module(emqx_node_rebalance_api).
+
+-behaviour(minirest_api).
+
+-include_lib("typerefl/include/types.hrl").
+-include_lib("hocon/include/hoconsc.hrl").
+-include_lib("emqx/include/logger.hrl").
+
+%% Swagger specs from hocon schema
+-export([
+    api_spec/0,
+    paths/0,
+    schema/1,
+    namespace/0
+]).
+
+-export([
+    fields/1,
+    roots/0
+]).
+
+%% API callbacks
+%% (each handler is named after the 'operationId' of its path in schema/1)
+-export([
+    '/load_rebalance/status'/2,
+    '/load_rebalance/global_status'/2,
+    '/load_rebalance/availability_check'/2,
+    '/load_rebalance/:node/start'/2,
+    '/load_rebalance/:node/stop'/2,
+    '/load_rebalance/:node/evacuation/start'/2,
+    '/load_rebalance/:node/evacuation/stop'/2
+]).
+
+%% Schema examples
+-export([
+    rebalance_example/0,
+    rebalance_evacuation_example/0,
+    translate/2
+]).
+
+-import(hoconsc, [mk/2, ref/1, ref/2]).
+-import(emqx_dashboard_swagger, [error_codes/2]).
+
+%% Error codes placed in the `code' field of error responses.
+-define(BAD_REQUEST, 'BAD_REQUEST').
+-define(NODE_UNAVAILABLE, 'NODE_UNAVAILABLE').
+-define(NODE_EVACUATING, 'NODE_EVACUATING').
+-define(RPC_ERROR, 'RPC_ERROR').
+
+%%--------------------------------------------------------------------
+%% API Spec
+%%--------------------------------------------------------------------
+
+%% Namespace string for this module's schema.
+namespace() -> "load_rebalance".
+
+%% Build the swagger spec from this module's paths/0 and schema/1, with
+%% schema checking turned on.
+api_spec() ->
+    emqx_dashboard_swagger:spec(?MODULE, #{check_schema => true}).
+
+%% Every path served by this module; each one has a schema/1 clause below.
+paths() ->
+    [
+        "/load_rebalance/status",
+        "/load_rebalance/global_status",
+        "/load_rebalance/availability_check",
+        "/load_rebalance/:node/start",
+        "/load_rebalance/:node/stop",
+        "/load_rebalance/:node/evacuation/start",
+        "/load_rebalance/:node/evacuation/stop"
+    ].
+
+%% Swagger description per path. The 'operationId' atom is also the name of
+%% the exported handler function for that path.
+schema("/load_rebalance/status") ->
+    #{
+        'operationId' => '/load_rebalance/status',
+        get => #{
+            tags => [<<"load_rebalance">>],
+            summary => <<"Get rebalance status">>,
+            description => ?DESC("load_rebalance_status"),
+            responses => #{
+                200 => local_status_response_schema()
+            }
+        }
+    };
+schema("/load_rebalance/global_status") ->
+    #{
+        'operationId' => '/load_rebalance/global_status',
+        get => #{
+            tags => [<<"load_rebalance">>],
+            summary => <<"Get global rebalance status">>,
+            description => ?DESC("load_rebalance_global_status"),
+            responses => #{
+                %% NOTE(review): documented as a generic map although
+                %% fields(global_status) describes this payload - confirm
+                %% whether ref(global_status) was intended.
+                200 => response_schema()
+            }
+        }
+    };
+schema("/load_rebalance/availability_check") ->
+    #{
+        'operationId' => '/load_rebalance/availability_check',
+        get => #{
+            tags => [<<"load_rebalance">>],
+            summary => <<"Node rebalance availability check">>,
+            description => ?DESC("load_rebalance_availability_check"),
+            responses => #{
+                200 => response_schema(),
+                503 => error_codes([?NODE_EVACUATING], <<"Node Evacuating">>)
+            }
+        }
+    };
+schema("/load_rebalance/:node/start") ->
+    #{
+        'operationId' => '/load_rebalance/:node/start',
+        post => #{
+            tags => [<<"load_rebalance">>],
+            summary => <<"Start rebalancing with the node as coordinator">>,
+            description => ?DESC("load_rebalance_start"),
+            parameters => [param_node()],
+            'requestBody' =>
+                emqx_dashboard_swagger:schema_with_examples(
+                    ref(rebalance_start),
+                    rebalance_example()
+                ),
+            responses => #{
+                200 => response_schema(),
+                400 => error_codes([?BAD_REQUEST, ?NODE_UNAVAILABLE], <<"Bad Request">>)
+            }
+        }
+    };
+schema("/load_rebalance/:node/stop") ->
+    #{
+        'operationId' => '/load_rebalance/:node/stop',
+        post => #{
+            tags => [<<"load_rebalance">>],
+            summary => <<"Stop rebalancing coordinated by the node">>,
+            description => ?DESC("load_rebalance_stop"),
+            parameters => [param_node()],
+            responses => #{
+                200 => response_schema(),
+                400 => error_codes([?BAD_REQUEST, ?NODE_UNAVAILABLE], <<"Bad Request">>)
+            }
+        }
+    };
+schema("/load_rebalance/:node/evacuation/start") ->
+    #{
+        'operationId' => '/load_rebalance/:node/evacuation/start',
+        post => #{
+            tags => [<<"load_rebalance">>],
+            summary => <<"Start evacuation on a node">>,
+            description => ?DESC("load_rebalance_evacuation_start"),
+            parameters => [param_node()],
+            'requestBody' =>
+                emqx_dashboard_swagger:schema_with_examples(
+                    ref(rebalance_evacuation_start),
+                    rebalance_evacuation_example()
+                ),
+            responses => #{
+                200 => response_schema(),
+                400 => error_codes([?BAD_REQUEST, ?NODE_UNAVAILABLE], <<"Bad Request">>)
+            }
+        }
+    };
+schema("/load_rebalance/:node/evacuation/stop") ->
+    #{
+        'operationId' => '/load_rebalance/:node/evacuation/stop',
+        post => #{
+            tags => [<<"load_rebalance">>],
+            summary => <<"Stop evacuation on a node">>,
+            description => ?DESC("load_rebalance_evacuation_stop"),
+            parameters => [param_node()],
+            responses => #{
+                200 => response_schema(),
+                400 => error_codes([?BAD_REQUEST, ?NODE_UNAVAILABLE], <<"Bad Request">>)
+            }
+        }
+    }.
+
+%%--------------------------------------------------------------------
+%% Handlers
+%%--------------------------------------------------------------------
+
+%% GET local status: `#{status => disabled}', or the stats map extended with
+%% `process' and `status => enabled' (see format_status/2).
+'/load_rebalance/status'(get, #{}) ->
+    case emqx_node_rebalance_status:local_status() of
+        disabled ->
+            {200, #{status => disabled}};
+        {rebalance, Stats} ->
+            {200, format_status(rebalance, Stats)};
+        {evacuation, Stats} ->
+            {200, format_status(evacuation, Stats)}
+    end.
+
+%% GET global status: the evacuation and rebalance lists, each
+%% `{Node, Info}' pair flattened into `Info#{node => Node}'.
+'/load_rebalance/global_status'(get, #{}) ->
+    #{
+        evacuations := Evacuations,
+        rebalances := Rebalances
+    } = emqx_node_rebalance_status:global_status(),
+    {200, #{
+        evacuations => format_as_map_list(Evacuations),
+        rebalances => format_as_map_list(Rebalances)
+    }}.
+
+%% GET availability: 200 while the eviction agent is disabled, 503 otherwise.
+'/load_rebalance/availability_check'(get, #{}) ->
+    case emqx_eviction_agent:status() of
+        {enabled, _Stats} ->
+            error_response(503, ?NODE_EVACUATING, <<"Node Evacuating">>);
+        disabled ->
+            {200, #{}}
+    end.
+
+%% POST start: translate the body, validate the `nodes' list, then start the
+%% rebalance on `Node' over RPC.
+'/load_rebalance/:node/start'(post, #{bindings := #{node := NodeBin}, body := Params0}) ->
+    StartOn = fun(Node) ->
+        Translated = translate(rebalance_start, Params0),
+        with_nodes_at_key(nodes, Translated, fun(Checked) ->
+            Result = emqx_node_rebalance_api_proto_v1:node_rebalance_start(Node, Checked),
+            wrap_rpc(Node, Result)
+        end)
+    end,
+    with_node(NodeBin, StartOn).
+
+%% POST stop: stop the rebalance coordinated by `Node'.
+'/load_rebalance/:node/stop'(post, #{bindings := #{node := NodeBin}}) ->
+    StopOn = fun(Node) ->
+        Result = emqx_node_rebalance_api_proto_v1:node_rebalance_stop(Node),
+        wrap_rpc(Node, Result)
+    end,
+    with_node(NodeBin, StopOn).
+
+%% POST evacuation start: translate the body, validate the `migrate_to'
+%% list, then start evacuation on `Node' over RPC.
+'/load_rebalance/:node/evacuation/start'(post, #{
+    bindings := #{node := NodeBin}, body := Params0
+}) ->
+    StartOn = fun(Node) ->
+        Translated = translate(rebalance_evacuation_start, Params0),
+        with_nodes_at_key(migrate_to, Translated, fun(Checked) ->
+            Result = emqx_node_rebalance_api_proto_v1:node_rebalance_evacuation_start(
+                Node, Checked
+            ),
+            wrap_rpc(Node, Result)
+        end)
+    end,
+    with_node(NodeBin, StartOn).
+
+%% POST evacuation stop: stop evacuation on `Node'.
+'/load_rebalance/:node/evacuation/stop'(post, #{bindings := #{node := NodeBin}}) ->
+    StopOn = fun(Node) ->
+        Result = emqx_node_rebalance_api_proto_v1:node_rebalance_evacuation_stop(Node),
+        wrap_rpc(Node, Result)
+    end,
+    with_node(NodeBin, StopOn).
+
+%%--------------------------------------------------------------------
+%% Helpers
+%%--------------------------------------------------------------------
+
+%% Map an RPC result to an HTTP response: `ok' -> 200, `{error, _}' -> 400,
+%% `{badrpc, _}' -> 503.
+wrap_rpc(_Node, ok) ->
+    {200, #{}};
+wrap_rpc(Node, {error, Reason}) ->
+    Message = io_lib:format("error on node ~p: ~p", [Node, Reason]),
+    error_response(400, ?BAD_REQUEST, Message);
+wrap_rpc(Node, {badrpc, Reason}) ->
+    Message = io_lib:format("RPC error on node ~p: ~p", [Node, Reason]),
+    error_response(503, ?RPC_ERROR, Message).
+
+%% Tag a stats map with the running process kind and `status => enabled'.
+format_status(Process, Stats) ->
+    Stats#{status => enabled, process => Process}.
+
+%% When `Key' is present in `Params', turn its list of binary node names
+%% into atoms and check that all of them are available. Returns
+%% `{ok, Params1}', `{error, {invalid, BinNodes}}' or
+%% `{error, {unavailable, Nodes}}'. Params without `Key' pass through.
+validate_nodes(Key, Params) when is_map_key(Key, Params) ->
+    Parsed = [{BinNode, parse_node(BinNode)} || BinNode <- maps:get(Key, Params)],
+    %% Both lists are reversed relative to the input on purpose: that is the
+    %% order produced by the accumulator-based implementation this replaces.
+    ValidNodes = lists:reverse([Node || {_, {ok, Node}} <- Parsed]),
+    InvalidNodes = lists:reverse([BinNode || {BinNode, {error, _}} <- Parsed]),
+    case InvalidNodes of
+        [_ | _] ->
+            {error, {invalid, InvalidNodes}};
+        [] ->
+            case emqx_node_rebalance_evacuation:available_nodes(ValidNodes) of
+                ValidNodes -> {ok, Params#{Key => ValidNodes}};
+                OtherNodes -> {error, {unavailable, ValidNodes -- OtherNodes}}
+            end
+    end;
+validate_nodes(_Key, Params) ->
+    {ok, Params}.
+
+%% Run `Fun(Node)' for a known node name, or answer 400 for an unknown one.
+with_node(BinNode, Fun) ->
+    case parse_node(BinNode) of
+        {error, _} ->
+            error_response(400, ?BAD_REQUEST, [<<"Invalid node: ">>, BinNode]);
+        {ok, Node} ->
+            Fun(Node)
+    end.
+
+%% Run `Fun(Params1)' once the node list under `Key' has been validated,
+%% or answer 400 describing the offending nodes.
+with_nodes_at_key(Key, Params, Fun) ->
+    case validate_nodes(Key, Params) of
+        {error, {invalid, Nodes}} ->
+            error_response(400, ?BAD_REQUEST, io_lib:format("Invalid nodes: ~p", [Nodes]));
+        {error, {unavailable, Nodes}} ->
+            error_response(400, ?NODE_UNAVAILABLE, io_lib:format("Nodes unavailable: ~p", [Nodes]));
+        {ok, Params1} ->
+            Fun(Params1)
+    end.
+
+%% Only existing atoms are accepted, so request data cannot create new atoms.
+parse_node(Bin) when is_binary(Bin) ->
+    try binary_to_existing_atom(Bin) of
+        Node -> {ok, Node}
+    catch
+        error:badarg ->
+            {error, {unknown, Bin}}
+    end.
+
+%% `[{Node, Info}]' -> `[Info#{node => Node}]'.
+format_as_map_list(List) ->
+    AddNode = fun({Node, Info}) -> Info#{node => Node} end,
+    lists:map(AddNode, List).
+
+%% Build the `{HttpCode, Body}' error tuple with binary `code' and `message'.
+error_response(HttpCode, Code, Message) ->
+    Body = #{
+        code => atom_to_binary(Code),
+        message => iolist_to_binary(Message)
+    },
+    {HttpCode, Body}.
+
+%% Remove the entries whose key is listed in `Keys' from a `{Key, _}' list.
+without(Keys, Props) ->
+    Keep = fun({Key, _Schema}) -> not lists:member(Key, Keys) end,
+    lists:filter(Keep, Props).
+
+%%------------------------------------------------------------------------------
+%% Schema
+%%------------------------------------------------------------------------------
+
+%% Check `Conf' against the fields of `Ref' in this module's schema and
+%% return the checked map with atom keys.
+translate(Ref, Conf) ->
+    Options = #{atom_key => true},
+    #{Ref := TranslatedConf} = hocon_tconf:check_plain(
+        ?MODULE, #{atom_to_binary(Ref) => Conf}, Options, [Ref]
+    ),
+    TranslatedConf.
+
+%% The `:node' path parameter shared by the per-node endpoints.
+param_node() ->
+    {
+        node,
+        mk(binary(), #{
+            in => path,
+            desc => ?DESC(param_node),
+            required => true
+        })
+    }.
+
+%% Request body of POST /load_rebalance/:node/start. Every field is optional;
+%% the relative thresholds must be greater than 1.0 and `nodes' non-empty.
+fields(rebalance_start) ->
+    [
+        {"wait_health_check",
+            mk(
+                emqx_schema:duration_s(),
+                #{
+                    desc => ?DESC(wait_health_check),
+                    required => false
+                }
+            )},
+        {"conn_evict_rate",
+            mk(
+                pos_integer(),
+                #{
+                    desc => ?DESC(conn_evict_rate),
+                    required => false
+                }
+            )},
+        {"sess_evict_rate",
+            mk(
+                pos_integer(),
+                #{
+                    desc => ?DESC(sess_evict_rate),
+                    required => false
+                }
+            )},
+        {"abs_conn_threshold",
+            mk(
+                pos_integer(),
+                #{
+                    desc => ?DESC(abs_conn_threshold),
+                    required => false
+                }
+            )},
+        {"rel_conn_threshold",
+            mk(
+                number(),
+                #{
+                    desc => ?DESC(rel_conn_threshold),
+                    required => false,
+                    validator => [fun(Value) -> Value > 1.0 end]
+                }
+            )},
+        {"abs_sess_threshold",
+            mk(
+                pos_integer(),
+                #{
+                    desc => ?DESC(abs_sess_threshold),
+                    required => false
+                }
+            )},
+        {"rel_sess_threshold",
+            mk(
+                number(),
+                #{
+                    desc => ?DESC(rel_sess_threshold),
+                    required => false,
+                    validator => [fun(Value) -> Value > 1.0 end]
+                }
+            )},
+        {"wait_takeover",
+            mk(
+                emqx_schema:duration_s(),
+                #{
+                    desc => ?DESC(wait_takeover),
+                    required => false
+                }
+            )},
+        {"nodes",
+            mk(
+                list(binary()),
+                #{
+                    desc => ?DESC(rebalance_nodes),
+                    required => false,
+                    validator => [fun(Values) -> length(Values) > 0 end]
+                }
+            )}
+    ];
+%% Request body of POST /load_rebalance/:node/evacuation/start.
+fields(rebalance_evacuation_start) ->
+    [
+        {"conn_evict_rate",
+            mk(
+                pos_integer(),
+                #{
+                    desc => ?DESC(conn_evict_rate),
+                    required => false
+                }
+            )},
+        {"sess_evict_rate",
+            mk(
+                pos_integer(),
+                #{
+                    desc => ?DESC(sess_evict_rate),
+                    required => false
+                }
+            )},
+        {"redirect_to",
+            mk(
+                binary(),
+                #{
+                    desc => ?DESC(redirect_to),
+                    required => false
+                }
+            )},
+        %% NOTE(review): typed pos_integer() here but emqx_schema:duration_s()
+        %% in rebalance_start - confirm the difference is intended.
+        {"wait_takeover",
+            mk(
+                pos_integer(),
+                #{
+                    desc => ?DESC(wait_takeover),
+                    required => false
+                }
+            )},
+        {"migrate_to",
+            mk(
+                list(binary()),
+                #{
+                    desc => ?DESC(migrate_to),
+                    required => false,
+                    validator => [fun(Values) -> length(Values) > 0 end]
+                }
+            )}
+    ];
+%% Local status payload while nothing is running.
+fields(local_status_disabled) ->
+    [
+        {"status",
+            mk(
+                disabled,
+                #{
+                    %% NOTE(review): reuses the `local_status_enabled'
+                    %% description key - possibly a copy-paste; confirm
+                    %% against the i18n file.
+                    desc => ?DESC(local_status_enabled),
+                    required => true
+                }
+            )}
+    ];
+%% Local status payload while a rebalance or an evacuation is running.
+fields(local_status_enabled) ->
+    [
+        {"status",
+            mk(
+                enabled,
+                #{
+                    desc => ?DESC(local_status_enabled),
+                    required => true
+                }
+            )},
+        {"process",
+            mk(
+                hoconsc:union([rebalance, evacuation]),
+                #{
+                    desc => ?DESC(local_status_process),
+                    required => true
+                }
+            )},
+        {"state",
+            mk(
+                atom(),
+                #{
+                    desc => ?DESC(local_status_state),
+                    required => true
+                }
+            )},
+        {"coordinator_node",
+            mk(
+                binary(),
+                #{
+                    desc => ?DESC(local_status_coordinator_node),
+                    required => false
+                }
+            )},
+        {"connection_eviction_rate",
+            mk(
+                pos_integer(),
+                #{
+                    desc => ?DESC(local_status_connection_eviction_rate),
+                    required => false
+                }
+            )},
+        {"session_eviction_rate",
+            mk(
+                pos_integer(),
+                #{
+                    desc => ?DESC(local_status_session_eviction_rate),
+                    required => false
+                }
+            )},
+        {"connection_goal",
+            mk(
+                non_neg_integer(),
+                #{
+                    desc => ?DESC(local_status_connection_goal),
+                    required => false
+                }
+            )},
+        {"session_goal",
+            mk(
+                non_neg_integer(),
+                #{
+                    desc => ?DESC(local_status_session_goal),
+                    required => false
+                }
+            )},
+        {"disconnected_session_goal",
+            mk(
+                non_neg_integer(),
+                #{
+                    desc => ?DESC(local_status_disconnected_session_goal),
+                    required => false
+                }
+            )},
+        {"session_recipients",
+            mk(
+                list(binary()),
+                #{
+                    desc => ?DESC(local_status_session_recipients),
+                    required => false
+                }
+            )},
+        {"recipients",
+            mk(
+                list(binary()),
+                #{
+                    desc => ?DESC(local_status_recipients),
+                    required => false
+                }
+            )},
+        {"stats",
+            mk(
+                ref(status_stats),
+                #{
+                    desc => ?DESC(local_status_stats),
+                    required => false
+                }
+            )}
+    ];
+%% Counters nested under "stats" in the enabled local status.
+fields(status_stats) ->
+    [
+        {"initial_connected",
+            mk(
+                non_neg_integer(),
+                #{
+                    desc => ?DESC(status_stats_initial_connected),
+                    required => true
+                }
+            )},
+        {"current_connected",
+            mk(
+                non_neg_integer(),
+                #{
+                    desc => ?DESC(status_stats_current_connected),
+                    required => true
+                }
+            )},
+        {"initial_sessions",
+            mk(
+                non_neg_integer(),
+                #{
+                    desc => ?DESC(status_stats_initial_sessions),
+                    required => true
+                }
+            )},
+        {"current_sessions",
+            mk(
+                non_neg_integer(),
+                #{
+                    desc => ?DESC(status_stats_current_sessions),
+                    required => true
+                }
+            )},
+        {"current_disconnected_sessions",
+            mk(
+                non_neg_integer(),
+                #{
+                    desc => ?DESC(status_stats_current_disconnected_sessions),
+                    required => false
+                }
+            )}
+    ];
+%% Coordinator entry of the global status: the enabled local status fields
+%% minus the ones listed below, plus donor information and the node name.
+fields(global_coordinator_status) ->
+    without(
+        ["status", "process", "session_goal", "session_recipients", "stats"],
+        fields(local_status_enabled)
+    ) ++
+        [
+            {"donors",
+                mk(
+                    list(binary()),
+                    #{
+                        desc => ?DESC(coordinator_status_donors),
+                        required => false
+                    }
+                )},
+            {"donor_conn_avg",
+                mk(
+                    non_neg_integer(),
+                    #{
+                        desc => ?DESC(coordinator_status_donor_conn_avg),
+                        required => false
+                    }
+                )},
+            {"donor_sess_avg",
+                mk(
+                    non_neg_integer(),
+                    #{
+                        desc => ?DESC(coordinator_status_donor_sess_avg),
+                        required => false
+                    }
+                )},
+            {"node",
+                mk(
+                    binary(),
+                    #{
+                        desc => ?DESC(coordinator_status_node),
+                        required => true
+                    }
+                )}
+        ];
+%% Evacuation entry of the global status: the enabled local status fields
+%% minus "status" and "process", plus the node name.
+fields(global_evacuation_status) ->
+    without(["status", "process"], fields(local_status_enabled)) ++
+        [
+            {"node",
+                mk(
+                    binary(),
+                    #{
+                        desc => ?DESC(evacuation_status_node),
+                        required => true
+                    }
+                )}
+        ];
+%% Cluster-wide status: lists of evacuation and coordinator entries.
+fields(global_status) ->
+    [
+        {"evacuations",
+            mk(
+                hoconsc:array(ref(global_evacuation_status)),
+                #{
+                    desc => ?DESC(global_status_evacuations),
+                    required => false
+                }
+            )},
+        {"rebalances",
+            mk(
+                hoconsc:array(ref(global_coordinator_status)),
+                #{
+                    desc => ?DESC(global_status_rebalances),
+                    required => false
+                }
+            )}
+    ].
+
+%% Example request body for the rebalance start endpoint.
+rebalance_example() ->
+    #{
+        wait_health_check => 10,
+        conn_evict_rate => 10,
+        sess_evict_rate => 20,
+        abs_conn_threshold => 10,
+        rel_conn_threshold => 1.5,
+        abs_sess_threshold => 10,
+        rel_sess_threshold => 1.5,
+        wait_takeover => 10,
+        nodes => [<<"othernode@127.0.0.1">>]
+    }.
+
+%% Example request body for the evacuation start endpoint.
+rebalance_evacuation_example() ->
+    #{
+        conn_evict_rate => 100,
+        sess_evict_rate => 100,
+        redirect_to => <<"othernode:1883">>,
+        wait_takeover => 10,
+        migrate_to => [<<"othernode@127.0.0.1">>]
+    }.
+
+%% The local status response is either the disabled or the enabled shape.
+local_status_response_schema() ->
+    hoconsc:union([ref(local_status_disabled), ref(local_status_enabled)]).
+
+%% Generic map response used by the remaining endpoints.
+response_schema() ->
+    mk(
+        map(),
+        #{
+            desc => ?DESC(empty_response)
+        }
+    ).
+
+%% No schema roots in this module.
+roots() -> [].
diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_app.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_app.erl
new file mode 100644
index 000000000..3cd59e0f4
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_app.erl
@@ -0,0 +1,22 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_app).
+
+-behaviour(application).
+
+-emqx_plugin(?MODULE).
+
+-export([
+    start/2,
+    stop/1
+]).
+
+%% Start the supervisor, then register the `rebalance' CLI command.
+start(_Type, _Args) ->
+    {ok, Sup} = emqx_node_rebalance_sup:start_link(),
+    ok = emqx_node_rebalance_cli:load(),
+    {ok, Sup}.
+
+%% Unregister the `rebalance' CLI command.
+stop(_State) ->
+    emqx_node_rebalance_cli:unload().
diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl
new file mode 100644
index 000000000..a2706f13b
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_cli.erl
@@ -0,0 +1,305 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_cli).
+
+%% APIs
+-export([
+    load/0,
+    unload/0,
+    cli/1
+]).
+
+%% Register the `rebalance' command with emqx_ctl.
+load() ->
+    emqx_ctl:register_command(rebalance, {?MODULE, cli}, []).
+
+%% Unregister the `rebalance' command.
+unload() ->
+    emqx_ctl:unregister_command(rebalance).
+
+%% Entry point of the `rebalance' command. The start/stop/node-status
+%% sub-commands print a message and return `true' on success or `false' on
+%% failure; unknown input prints the usage.
+cli(["start" | StartArgs]) ->
+    case start_args(StartArgs) of
+        {evacuation, Opts} ->
+            start_evacuation(Opts);
+        {rebalance, Opts} ->
+            start_rebalance(Opts);
+        {error, Error} ->
+            emqx_ctl:print("Rebalance start error: ~s~n", [Error]),
+            false
+    end;
+cli(["node-status", NodeStr]) ->
+    case emqx_misc:safe_to_existing_atom(NodeStr, utf8) of
+        {ok, Node} ->
+            node_status(emqx_node_rebalance_status:local_status(Node));
+        {error, _} ->
+            emqx_ctl:print("Node status error: invalid node~n"),
+            false
+    end;
+cli(["node-status"]) ->
+    node_status(emqx_node_rebalance_status:local_status());
+cli(["status"]) ->
+    #{
+        evacuations := Evacuations,
+        rebalances := Rebalances
+    } = emqx_node_rebalance_status:global_status(),
+    lists:foreach(
+        fun({Node, Status}) ->
+            emqx_ctl:print(
+                "--------------------------------------------------------------------~n"
+            ),
+            emqx_ctl:print(
+                "Node ~p: evacuation~n~s",
+                [Node, emqx_node_rebalance_status:format_local_status(Status)]
+            )
+        end,
+        Evacuations
+    ),
+    lists:foreach(
+        fun({Node, Status}) ->
+            emqx_ctl:print(
+                "--------------------------------------------------------------------~n"
+            ),
+            emqx_ctl:print(
+                "Node ~p: rebalance coordinator~n~s",
+                [Node, emqx_node_rebalance_status:format_coordinator_status(Status)]
+            )
+        end,
+        Rebalances
+    );
+cli(["stop"]) ->
+    case emqx_node_rebalance_evacuation:status() of
+        {enabled, _} ->
+            report_stop("Rebalance(evacuation)", emqx_node_rebalance_evacuation:stop());
+        disabled ->
+            stop_rebalance()
+    end;
+cli(_) ->
+    emqx_ctl:usage(
+        [
+            {
+                "rebalance start --evacuation \\\n"
+                " [--redirect-to \"Host1:Port1 Host2:Port2 ...\"] \\\n"
+                " [--conn-evict-rate CountPerSec] \\\n"
+                " [--migrate-to \"node1@host1 node2@host2 ...\"] \\\n"
+                " [--wait-takeover Secs] \\\n"
+                " [--sess-evict-rate CountPerSec]",
+                "Start current node evacuation with optional server redirect to the specified servers"
+            },
+
+            {
+                "rebalance start \\\n"
+                " [--nodes \"node1@host1 node2@host2\"] \\\n"
+                " [--wait-health-check Secs] \\\n"
+                " [--conn-evict-rate ConnPerSec] \\\n"
+                " [--abs-conn-threshold Count] \\\n"
+                " [--rel-conn-threshold Fraction] \\\n"
+                " [--wait-takeover Secs] \\\n"
+                " [--sess-evict-rate CountPerSec] \\\n"
+                " [--abs-sess-threshold Count] \\\n"
+                " [--rel-sess-threshold Fraction]",
+                "Start rebalance on the specified nodes using the current node as the coordinator"
+            },
+
+            {"rebalance node-status", "Get current node rebalance status"},
+
+            {"rebalance node-status \"node1@host1\"", "Get remote node rebalance status"},
+
+            {"rebalance status",
+                "Get statuses of all current rebalance/evacuation processes across the cluster"},
+
+            {"rebalance stop", "Stop node rebalance"}
+        ]
+    ).
+
+%% Start evacuation unless it is already running. The start call itself can
+%% still fail (for example with `eviction_agent_busy' while a rebalance holds
+%% the eviction agent), so its error is reported instead of crashing the CLI.
+start_evacuation(Opts) ->
+    case emqx_node_rebalance_evacuation:status() of
+        disabled ->
+            case emqx_node_rebalance_evacuation:start(Opts) of
+                ok ->
+                    emqx_ctl:print("Rebalance(evacuation) started~n"),
+                    true;
+                {error, Reason} ->
+                    emqx_ctl:print("Rebalance(evacuation) start error: ~p~n", [Reason]),
+                    false
+            end;
+        {enabled, _} ->
+            emqx_ctl:print("Rebalance is already enabled~n"),
+            false
+    end.
+
+%% Start a rebalance with this node as the coordinator.
+start_rebalance(Opts) ->
+    case emqx_node_rebalance:start(Opts) of
+        ok ->
+            emqx_ctl:print("Rebalance started~n"),
+            true;
+        {error, Reason} ->
+            emqx_ctl:print("Rebalance start error: ~p~n", [Reason]),
+            false
+    end.
+
+%% Stop a running rebalance, or report that nothing is running.
+stop_rebalance() ->
+    case emqx_node_rebalance:status() of
+        {enabled, _} ->
+            report_stop("Rebalance", emqx_node_rebalance:stop());
+        disabled ->
+            emqx_ctl:print("Rebalance is already disabled~n"),
+            false
+    end.
+
+%% Print the outcome of a stop call. The process may have stopped between the
+%% status check and the stop call, so `{error, _}' is reported, not asserted.
+report_stop(What, ok) ->
+    emqx_ctl:print("~s stopped~n", [What]),
+    true;
+report_stop(What, {error, Reason}) ->
+    emqx_ctl:print("~s stop error: ~p~n", [What, Reason]),
+    false.
+
+%% Print a local status as returned by emqx_node_rebalance_status.
+node_status({Process, Status}) when Process =:= evacuation; Process =:= rebalance ->
+    emqx_ctl:print(
+        "Rebalance type: ~p~n~s~n",
+        [Process, emqx_node_rebalance_status:format_local_status(Status)]
+    );
+node_status(disabled) ->
+    emqx_ctl:print("Rebalance disabled~n");
+node_status(Other) ->
+    emqx_ctl:print("Error detecting rebalance status: ~p~n", [Other]).
+
+%% Parse `rebalance start' arguments into `{evacuation, Opts}',
+%% `{rebalance, Opts}' or `{error, Message}'.
+start_args(Args) ->
+    case collect_args(Args, #{}) of
+        {error, _} = Error ->
+            Error;
+        {ok, #{"--evacuation" := true} = Collected} ->
+            tag_validated(evacuation, validate_evacuation(maps:to_list(Collected), #{}));
+        {ok, #{} = Collected} ->
+            tag_validated(rebalance, validate_rebalance(maps:to_list(Collected), #{}))
+    end.
+
+tag_validated(Kind, {ok, Validated}) -> {Kind, Validated};
+tag_validated(_Kind, {error, _} = Error) -> Error.
+
+%% Collect the raw command line into a map keyed by option name.
+%% `--evacuation' is a bare flag; every other known option takes one value.
+collect_args([], Map) ->
+    {ok, Map};
+collect_args(["--evacuation" | Args], Map) ->
+    collect_args(Args, Map#{"--evacuation" => true});
+collect_args([Option, Value | Args] = Remaining, Map) ->
+    case lists:member(Option, value_options()) of
+        true -> collect_args(Args, Map#{Option => Value});
+        false -> unknown_args(Remaining)
+    end;
+collect_args(Args, _Map) ->
+    unknown_args(Args).
+
+%% Options that are followed by exactly one value.
+value_options() ->
+    [
+        %% evacuation
+        "--redirect-to",
+        "--migrate-to",
+        %% rebalance
+        "--nodes",
+        "--wait-health-check",
+        "--abs-conn-threshold",
+        "--rel-conn-threshold",
+        "--abs-sess-threshold",
+        "--rel-sess-threshold",
+        %% common
+        "--conn-evict-rate",
+        "--wait-takeover",
+        "--sess-evict-rate"
+    ].
+
+unknown_args(Args) ->
+    {error, io_lib:format("unknown arguments: ~p", [Args])}.
+
+%% Validate the collected evacuation options into a start options map.
+validate_evacuation([], Map) ->
+    {ok, Map};
+validate_evacuation([{"--evacuation", _} | Rest], Map) ->
+    validate_evacuation(Rest, Map);
+validate_evacuation([{"--redirect-to", ServerReference} | Rest], Map) ->
+    validate_evacuation(Rest, Map#{server_reference => list_to_binary(ServerReference)});
+validate_evacuation([{"--migrate-to", MigrateTo} | Rest], Map) ->
+    case strings_to_atoms(string:tokens(MigrateTo, ", ")) of
+        {Nodes, []} ->
+            validate_migrate_to(Nodes, Rest, Map);
+        {_, Invalid} ->
+            {error, io_lib:format("invalid --migrate-to, invalid nodes: ~p", [Invalid])}
+    end;
+validate_evacuation([{Option, _} | _] = Opts, Map) ->
+    case evacuation_int_option(Option) of
+        {ok, Key} ->
+            validate_pos_int(Key, Opts, Map, fun validate_evacuation/2);
+        error ->
+            {error, io_lib:format("unknown evacuation arguments: ~p", [Opts])}
+    end;
+validate_evacuation(Rest, _Map) ->
+    {error, io_lib:format("unknown evacuation arguments: ~p", [Rest])}.
+
+%% Evacuation options whose value must be a positive integer.
+evacuation_int_option("--conn-evict-rate") -> {ok, conn_evict_rate};
+evacuation_int_option("--sess-evict-rate") -> {ok, sess_evict_rate};
+evacuation_int_option("--wait-takeover") -> {ok, wait_takeover};
+evacuation_int_option(_Option) -> error.
+
+%% Every requested migration target must be reported as available.
+validate_migrate_to(Nodes, Rest, Map) ->
+    case emqx_node_rebalance_evacuation:available_nodes(Nodes) of
+        [] ->
+            {error, "invalid --migrate-to, no nodes"};
+        Nodes ->
+            validate_evacuation(Rest, Map#{migrate_to => Nodes});
+        OtherNodes ->
+            {error,
+                io_lib:format(
+                    "invalid --migrate-to, unavailable nodes: ~p",
+                    [Nodes -- OtherNodes]
+                )}
+    end.
+
+%% Validate the collected rebalance options into a start options map.
+validate_rebalance([], Map) ->
+    {ok, Map};
+validate_rebalance([{"--nodes", NodeStr} | Rest], Map) ->
+    case strings_to_atoms(string:tokens(NodeStr, ", ")) of
+        {Nodes, []} ->
+            validate_rebalance_nodes(Nodes, Rest, Map);
+        {_, Invalid} ->
+            {error, io_lib:format("invalid --nodes, invalid nodes: ~p", [Invalid])}
+    end;
+validate_rebalance([{Option, _} | _] = Opts, Map) ->
+    case rebalance_option(Option) of
+        {pos_int, Key} ->
+            validate_pos_int(Key, Opts, Map, fun validate_rebalance/2);
+        {fraction, Key} ->
+            validate_fraction(Key, Opts, Map, fun validate_rebalance/2);
+        unknown ->
+            {error, io_lib:format("unknown rebalance arguments: ~p", [Opts])}
+    end;
+validate_rebalance(Rest, _Map) ->
+    {error, io_lib:format("unknown rebalance arguments: ~p", [Rest])}.
+
+%% Option name -> value kind and the key it is stored under.
+rebalance_option("--wait-health-check") -> {pos_int, wait_health_check};
+rebalance_option("--conn-evict-rate") -> {pos_int, conn_evict_rate};
+rebalance_option("--sess-evict-rate") -> {pos_int, sess_evict_rate};
+rebalance_option("--abs-conn-threshold") -> {pos_int, abs_conn_threshold};
+rebalance_option("--rel-conn-threshold") -> {fraction, rel_conn_threshold};
+rebalance_option("--abs-sess-threshold") -> {pos_int, abs_sess_threshold};
+rebalance_option("--rel-sess-threshold") -> {fraction, rel_sess_threshold};
+rebalance_option("--wait-takeover") -> {pos_int, wait_takeover};
+rebalance_option(_Option) -> unknown.
+
+%% Every requested node must be reported as available.
+validate_rebalance_nodes(Nodes, Rest, Map) ->
+    case emqx_node_rebalance:available_nodes(Nodes) of
+        [] ->
+            {error, "invalid --nodes, no nodes"};
+        Nodes ->
+            validate_rebalance(Rest, Map#{nodes => Nodes});
+        OtherNodes ->
+            {error,
+                io_lib:format(
+                    "invalid --nodes, unavailable nodes: ~p",
+                    [Nodes -- OtherNodes]
+                )}
+    end.
+
+%% Validate a relative threshold option: the value must be a number greater
+%% than 1.0, written either as a float ("1.5") or as an integer ("2").
+%% On success `Name => Float' is stored and validation continues with
+%% `Next(Rest, Map1)'; otherwise `{error, Message}' is returned.
+validate_fraction(Name, [{OptionName, Value} | Rest], Map, Next) ->
+    case parse_fraction(Value) of
+        {ok, Num} when Num > 1.0 ->
+            Next(Rest, Map#{Name => Num});
+        _ ->
+            {error, "invalid " ++ OptionName ++ " value"}
+    end.
+
+%% string:to_float/1 rejects text without a decimal point, so integer text
+%% is parsed separately and converted to a float.
+parse_fraction(Value) ->
+    case string:to_float(Value) of
+        {Float, ""} ->
+            {ok, Float};
+        _ ->
+            case string:to_integer(Value) of
+                {Int, ""} -> {ok, float(Int)};
+                _ -> error
+            end
+    end.
+
+%% Validate an option whose value must be a positive integer with no
+%% trailing characters.
+validate_pos_int(Name, [{OptionName, Value} | Rest], Map, Next) ->
+    case string:to_integer(Value) of
+        {Int, ""} when Int > 0 ->
+            Next(Rest, Map#{Name => Int});
+        _ ->
+            {error, "invalid " ++ OptionName ++ " value"}
+    end.
+
+%% Convert strings to already existing atoms. Returns `{Atoms, Invalid}',
+%% both in input order; `Invalid' holds the strings with no existing atom.
+strings_to_atoms(Strings) ->
+    strings_to_atoms(Strings, [], []).
+
+strings_to_atoms([], Atoms, Invalid) ->
+    {lists:reverse(Atoms), lists:reverse(Invalid)};
+strings_to_atoms([Str | Rest], Atoms, Invalid) ->
+    case emqx_misc:safe_to_existing_atom(Str, utf8) of
+        {ok, Atom} ->
+            strings_to_atoms(Rest, [Atom | Atoms], Invalid);
+        {error, _} ->
+            strings_to_atoms(Rest, Atoms, [Str | Invalid])
+    end.
diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation.erl
new file mode 100644
index 000000000..4de362ca9
--- /dev/null
+++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation.erl
@@ -0,0 +1,308 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_node_rebalance_evacuation).
+
+-include("emqx_node_rebalance.hrl").
+
+-include_lib("emqx/include/logger.hrl").
+-include_lib("emqx/include/types.hrl").
+-include_lib("snabbkaffe/include/snabbkaffe.hrl").
+
+-export([
+    start/1,
+    status/0,
+    stop/0
+]).
+
+-export([start_link/0]).
+
+-behaviour(gen_statem).
+
+-export([
+    init/1,
+    callback_mode/0,
+    handle_event/4,
+    code_change/4
+]).
+
+-export([
+    is_node_available/0,
+    available_nodes/1
+]).
+
+-export_type([
+    start_opts/0,
+    start_error/0
+]).
+
+-ifdef(TEST).
+-export([migrate_to/1]).
+-endif.
+
+%%--------------------------------------------------------------------
+%% APIs
+%%--------------------------------------------------------------------
+
+%% Delay in milliseconds before session eviction is retried when there are
+%% no nodes to migrate sessions to.
+-define(EVICT_INTERVAL_NO_NODES, 30000).
+
+-type migrate_to() :: [node()] | undefined.
+
+-type start_opts() :: #{
+    server_reference => emqx_eviction_agent:server_reference(),
+    conn_evict_rate => pos_integer(),
+    sess_evict_rate => pos_integer(),
+    wait_takeover => pos_integer(),
+    migrate_to => migrate_to()
+}.
+-type start_error() :: already_started | eviction_agent_busy.
+%% Map returned inside `{enabled, _}' by status/0. `state' is the name of the
+%% current gen_statem state; the status handler always adds it.
+-type stats() :: #{
+    state := atom(),
+    initial_conns := non_neg_integer(),
+    initial_sessions := non_neg_integer(),
+    current_conns := non_neg_integer(),
+    current_sessions := non_neg_integer(),
+    conn_evict_rate := pos_integer(),
+    sess_evict_rate := pos_integer(),
+    server_reference := emqx_eviction_agent:server_reference(),
+    migrate_to := migrate_to()
+}.
+-type status() :: {enabled, stats()} | disabled.
+
+%% Start evacuation of the local node. `StartOpts' override the defaults.
+-spec start(start_opts()) -> ok_or_error(start_error()).
+start(StartOpts) ->
+    Opts = maps:merge(default_opts(), StartOpts),
+    gen_statem:call(?MODULE, {start, Opts}).
+
+%% Stop a running evacuation of the local node.
+-spec stop() -> ok_or_error(not_started).
+stop() ->
+    gen_statem:call(?MODULE, stop).
+
+%% Evacuation status of the local node.
+-spec status() -> status().
+status() ->
+    gen_statem:call(?MODULE, status).
+
+%% Start the state machine registered locally under the module name.
+-spec start_link() -> startlink_ret().
+start_link() ->
+    gen_statem:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+%% Ask `Nodes' over the proto module which of them are available and keep
+%% only the atom results. The list of failed nodes is ignored.
+-spec available_nodes(list(node())) -> list(node()).
+available_nodes(Nodes) when is_list(Nodes) ->
+    {Available, _} = emqx_node_rebalance_evacuation_proto_v1:available_nodes(Nodes),
+    lists:filter(fun is_atom/1, Available).
+
+%%--------------------------------------------------------------------
+%% gen_statem callbacks
+%%--------------------------------------------------------------------
+
+callback_mode() -> handle_event_function.

%% states: disabled, evicting_conns, waiting_takeover, evicting_sessions, prohibiting

%% On (re)start, resume a previously persisted evacuation if there is one.
%% When the eviction agent is already taken by someone else the persisted
%% state is dropped and the machine starts in `disabled`.
init([]) ->
    case emqx_node_rebalance_evacuation_persist:read(default_opts()) of
        {ok, #{server_reference := ServerReference} = Opts} ->
            ?SLOG(warning, #{msg => "restoring_evacuation_state", opts => Opts}),
            case emqx_eviction_agent:enable(?MODULE, ServerReference) of
                ok ->
                    Data = init_data(#{}, Opts),
                    ok = warn_enabled(),
                    {ok, evicting_conns, Data, [{state_timeout, 0, evict_conns}]};
                {error, eviction_agent_busy} ->
                    emqx_node_rebalance_evacuation_persist:clear(),
                    {ok, disabled, #{}}
            end;
        none ->
            {ok, disabled, #{}}
    end.

%% start
%% Only honoured in `disabled`: enables the eviction agent, persists the
%% options and kicks off connection eviction immediately.
handle_event(
    {call, From},
    {start, #{server_reference := ServerReference} = Opts},
    disabled,
    #{} = Data
) ->
    case emqx_eviction_agent:enable(?MODULE, ServerReference) of
        ok ->
            NewData = init_data(Data, Opts),
            ok = emqx_node_rebalance_evacuation_persist:save(Opts),
            ?SLOG(warning, #{
                msg => "node_evacuation_started",
                opts => Opts
            }),
            {next_state, evicting_conns, NewData, [
                {state_timeout, 0, evict_conns},
                {reply, From, ok}
            ]};
        {error, eviction_agent_busy} ->
            {keep_state_and_data, [{reply, From, {error, eviction_agent_busy}}]}
    end;
handle_event({call, From}, {start, _Opts}, _State, #{}) ->
    {keep_state_and_data, [{reply, From, {error, already_started}}]};
%% stop
handle_event({call, From}, stop, disabled, #{}) ->
    {keep_state_and_data, [{reply, From, {error, not_started}}]};
handle_event({call, From}, stop, _State, Data) ->
    ok = emqx_node_rebalance_evacuation_persist:clear(),
    _ = emqx_eviction_agent:disable(?MODULE),
    ?SLOG(warning, #{msg => "node_evacuation_stopped"}),
    {next_state, disabled, deinit(Data), [{reply, From, ok}]};
%% status
handle_event({call, From}, status, disabled, #{}) ->
    {keep_state_and_data, [{reply, From, disabled}]};
handle_event({call, From}, status, State, #{migrate_to := MigrateTo} = Data) ->
    Stats = maps:with(
        [
            initial_conns,
            current_conns,
            initial_sessions,
            current_sessions,
            server_reference,
            conn_evict_rate,
            sess_evict_rate
        ],
        Data
    ),
    %% `migrate_to` is reported as the list of nodes that are available
    %% right now, not as the raw option value.
    {keep_state_and_data, [
        {reply, From, {enabled, Stats#{state => State, migrate_to => migrate_to(MigrateTo)}}}
    ]};
%% conn eviction
%% Evict up to `conn_evict_rate` connections per tick until the agent
%% reports zero connections, then wait `wait_takeover` seconds before
%% moving on to session eviction.
handle_event(
    state_timeout,
    evict_conns,
    evicting_conns,
    #{
        conn_evict_rate := ConnEvictRate,
        wait_takeover := WaitTakeover
    } = Data
) ->
    case emqx_eviction_agent:status() of
        {enabled, #{connections := Conns}} when Conns > 0 ->
            ok = emqx_eviction_agent:evict_connections(ConnEvictRate),
            ?tp(debug, node_evacuation_evict_conn, #{conn_evict_rate => ConnEvictRate}),
            ?SLOG(
                warning,
                #{
                    msg => "node_evacuation_evict_conns",
                    count => Conns,
                    conn_evict_rate => ConnEvictRate
                }
            ),
            NewData = Data#{current_conns => Conns},
            {keep_state, NewData, [{state_timeout, ?EVICT_INTERVAL, evict_conns}]};
        {enabled, #{connections := 0}} ->
            NewData = Data#{current_conns => 0},
            ?SLOG(warning, #{msg => "node_evacuation_evict_conns_done"}),
            {next_state, waiting_takeover, NewData, [
                {state_timeout, timer:seconds(WaitTakeover), evict_sessions}
            ]}
    end;
handle_event(
    state_timeout,
    evict_sessions,
    waiting_takeover,
    Data
) ->
    ?SLOG(warning, #{msg => "node_evacuation_waiting_takeover_done"}),
    {next_state, evicting_sessions, Data, [{state_timeout, 0, evict_sessions}]};
%% session eviction
%% Evict up to `sess_evict_rate` sessions per tick to the currently
%% available recipient nodes. When there are none, retry later.
handle_event(
    state_timeout,
    evict_sessions,
    evicting_sessions,
    #{
        sess_evict_rate := SessEvictRate,
        migrate_to := MigrateTo,
        current_sessions := _PrevSessCount
    } = Data
) ->
    case emqx_eviction_agent:status() of
        {enabled, #{sessions := SessCount}} when SessCount > 0 ->
            case migrate_to(MigrateTo) of
                [] ->
                    %% Log and record the count just obtained from the agent
                    %% rather than the one remembered from an earlier tick, so
                    %% that both the log line and status/0 stay accurate while
                    %% waiting for recipients.
                    ?SLOG(warning, #{
                        msg => "no_nodes_to_evacuate_sessions", session_count => SessCount
                    }),
                    NewData = Data#{current_sessions => SessCount},
                    {keep_state, NewData, [
                        {state_timeout, ?EVICT_INTERVAL_NO_NODES, evict_sessions}
                    ]};
                Nodes ->
                    ok = emqx_eviction_agent:evict_sessions(SessEvictRate, Nodes),
                    ?SLOG(
                        warning,
                        #{
                            msg => "node_evacuation_evict_sessions",
                            session_count => SessCount,
                            session_evict_rate => SessEvictRate,
                            target_nodes => Nodes
                        }
                    ),
                    NewData = Data#{current_sessions => SessCount},
                    {keep_state, NewData, [{state_timeout, ?EVICT_INTERVAL, evict_sessions}]}
            end;
        {enabled, #{sessions := 0}} ->
            ?tp(debug, node_evacuation_evict_sess_over, #{}),
            ?SLOG(warning, #{msg => "node_evacuation_evict_sessions_over"}),
            NewData = Data#{current_sessions => 0},
            {next_state, prohibiting, NewData}
    end;
%% Anything else is logged and otherwise ignored.
handle_event({call, From}, Msg, State, Data) ->
    ?SLOG(warning, #{msg => "unknown_call", call => Msg, state => State, data => Data}),
    {keep_state_and_data, [{reply, From, ignored}]};
handle_event(info, Msg, State, Data) ->
    ?SLOG(warning, #{msg => "unknown_info", info => Msg, state => State, data => Data}),
    keep_state_and_data;
handle_event(cast, Msg, State, Data) ->
    ?SLOG(warning, #{msg => "unknown_cast", cast => Msg, state => State, data => Data}),
    keep_state_and_data.

code_change(_Vsn, State, Data, _Extra) ->
    {ok, State, Data}.

%%--------------------------------------------------------------------
%% internal funs
%%--------------------------------------------------------------------

%% Option values used for every key the caller of start/1 leaves out.
default_opts() ->
    #{
        server_reference => undefined,
        conn_evict_rate => ?DEFAULT_CONN_EVICT_RATE,
        sess_evict_rate => ?DEFAULT_SESS_EVICT_RATE,
        wait_takeover => ?DEFAULT_WAIT_TAKEOVER,
        migrate_to => undefined
    }.

%% Merge the start options into the state data and snapshot the agent's
%% current connection/session counts as both "initial" and "current".
%% Must be called after the eviction agent has been enabled: the match on
%% `{enabled, _}` fails otherwise.
init_data(Data0, Opts) ->
    Data1 = maps:merge(Data0, Opts),
    {enabled, #{connections := ConnCount, sessions := SessCount}} = emqx_eviction_agent:status(),
    Data1#{
        initial_conns => ConnCount,
        current_conns => ConnCount,
        initial_sessions => SessCount,
        current_sessions => SessCount
    }.

%% Remove everything init_data/2 added, i.e. the counters and all option keys.
deinit(Data) ->
    Keys =
        [initial_conns, current_conns, initial_sessions, current_sessions] ++
            maps:keys(default_opts()),
    maps:without(Keys, Data).
+ +warn_enabled() -> + ?SLOG(warning, #{msg => "node_evacuation_enabled"}), + io:format( + standard_error, "Node evacuation is enabled. The node will not receive connections.~n", [] + ). + +migrate_to(undefined) -> + migrate_to(all_nodes()); +migrate_to(Nodes) when is_list(Nodes) -> + available_nodes(Nodes). + +is_node_available() -> + disabled = emqx_eviction_agent:status(), + node(). + +all_nodes() -> + mria_mnesia:running_nodes() -- [node()]. diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation_persist.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation_persist.erl new file mode 100644 index 000000000..3fc9faeea --- /dev/null +++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_evacuation_persist.erl @@ -0,0 +1,120 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_evacuation_persist). + +-export([ + save/1, + clear/0, + read/1 +]). + +-ifdef(TEST). +-export([evacuation_filepath/0]). +-endif. + +-include("emqx_node_rebalance.hrl"). +-include_lib("emqx/include/types.hrl"). + +%%-------------------------------------------------------------------- +%% APIs +%%-------------------------------------------------------------------- + +%% do not persist `migrate_to`: +%% * after restart there is nothing to migrate +%% * this value may be invalid after node was offline +-type persisted_start_opts() :: #{ + server_reference => emqx_eviction_agent:server_reference(), + conn_evict_rate => pos_integer(), + sess_evict_rate => pos_integer(), + wait_takeover => pos_integer() +}. +-type start_opts() :: #{ + server_reference => emqx_eviction_agent:server_reference(), + conn_evict_rate => pos_integer(), + sess_evict_rate => pos_integer(), + wait_takeover => pos_integer(), + migrate_to => emqx_node_rebalance_evacuation:migrate_to() +}. 

%% Persist the evacuation options as pretty-printed JSON in the evacuation
%% file, creating the parent directory if needed. Only the keys listed in
%% persist_keys/0 are written. There is no clause for options that fail the
%% guard below, so such a call raises `function_clause`.
-spec save(persisted_start_opts()) -> ok_or_error(term()).
save(
    #{
        server_reference := ServerReference,
        conn_evict_rate := ConnEvictRate,
        sess_evict_rate := SessEvictRate,
        wait_takeover := WaitTakeover
    } = Data
) when
    (is_binary(ServerReference) orelse ServerReference =:= undefined) andalso
        is_integer(ConnEvictRate) andalso ConnEvictRate > 0 andalso
        is_integer(SessEvictRate) andalso SessEvictRate > 0 andalso
        is_integer(WaitTakeover) andalso WaitTakeover >= 0
->
    Filepath = evacuation_filepath(),
    case filelib:ensure_dir(Filepath) of
        ok ->
            JsonData = emqx_json:encode(
                prepare_for_encode(maps:with(persist_keys(), Data)),
                [pretty]
            ),
            file:write_file(Filepath, JsonData);
        {error, _} = Error ->
            Error
    end.

%% Remove the persisted evacuation state.
%%
%% A missing file already is the desired end state, so `enoent` is reported
%% as success; callers assert `ok = clear()` and must not crash just because
%% the file was removed behind our back. Any other failure is returned as is.
-spec clear() -> ok | {error, file:posix() | badarg}.
clear() ->
    case file:delete(evacuation_filepath()) of
        ok -> ok;
        {error, enoent} -> ok;
        {error, _} = Error -> Error
    end.

%% Load persisted options, filling the gaps from `DefaultOpts`.
%%
%% Returns `none` when the file cannot be read. A file that exists but does
%% not decode to a JSON object yields `{ok, DefaultOpts}`: any readable file
%% is reported as persisted state.
-spec read(start_opts()) -> {ok, start_opts()} | none.
read(DefaultOpts) ->
    case file:read_file(evacuation_filepath()) of
        {ok, Data} ->
            case emqx_json:safe_decode(Data, [return_maps]) of
                {ok, Map} when is_map(Map) ->
                    {ok, map_to_opts(DefaultOpts, Map)};
                _NotAMap ->
                    {ok, DefaultOpts}
            end;
        {error, _} ->
            none
    end.

%%--------------------------------------------------------------------
%% Internal funcs
%%--------------------------------------------------------------------

%% The option keys that are written to disk.
persist_keys() ->
    [
        server_reference,
        conn_evict_rate,
        sess_evict_rate,
        wait_takeover
    ].

%% Swap the atom `undefined` for `null` before encoding.
prepare_for_encode(#{server_reference := undefined} = Data) ->
    Data#{server_reference => null};
prepare_for_encode(Data) ->
    Data.

%% Inverse of prepare_for_encode/1.
format_after_decode(#{server_reference := null} = Data) ->
    Data#{server_reference => undefined};
format_after_decode(Data) ->
    Data.

%% Build an options map that has exactly the keys of `DefaultOpts`, taking
%% each value from the decoded JSON map when present.
map_to_opts(DefaultOpts, Map) ->
    format_after_decode(
        map_to_opts(
            maps:to_list(DefaultOpts), Map, #{}
        )
    ).
+ +map_to_opts([], _Map, Opts) -> + Opts; +map_to_opts([{Key, DefaultVal} | Rest], Map, Opts) -> + map_to_opts(Rest, Map, Opts#{Key => maps:get(atom_to_binary(Key), Map, DefaultVal)}). + +evacuation_filepath() -> + filename:join([emqx:data_dir(), ?EVACUATION_FILENAME]). diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_status.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_status.erl new file mode 100644 index 000000000..63675a3da --- /dev/null +++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_status.erl @@ -0,0 +1,238 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_status). + +-export([ + local_status/0, + local_status/1, + global_status/0, + format_local_status/1, + format_coordinator_status/1 +]). + +%% For RPC +-export([ + evacuation_status/0, + rebalance_status/0 +]). + +%%-------------------------------------------------------------------- +%% APIs +%%-------------------------------------------------------------------- + +-spec local_status() -> disabled | {evacuation, map()} | {rebalance, map()}. +local_status() -> + case emqx_node_rebalance_evacuation:status() of + {enabled, Status} -> + {evacuation, evacuation(Status)}; + disabled -> + case emqx_node_rebalance_agent:status() of + {enabled, CoordinatorPid} -> + case emqx_node_rebalance:status(CoordinatorPid) of + {enabled, Status} -> + local_rebalance(Status, node()); + disabled -> + disabled + end; + disabled -> + disabled + end + end. + +-spec local_status(node()) -> disabled | {evacuation, map()} | {rebalance, map()}. +local_status(Node) -> + emqx_node_rebalance_status_proto_v1:local_status(Node). + +-spec format_local_status(map()) -> iodata(). +format_local_status(Status) -> + format_status(Status, local_status_field_format_order()). 

%% Collect the rebalance and evacuation status of every running node.
%% Only nodes that report `{enabled, Status}` end up in the result.
-spec global_status() -> #{rebalances := [{node(), map()}], evacuations := [{node(), map()}]}.
global_status() ->
    RunningNodes = mria_mnesia:running_nodes(),
    {RebalanceReplies, _} = emqx_node_rebalance_status_proto_v1:rebalance_status(RunningNodes),
    Rebalances = enabled_statuses(RebalanceReplies, fun coordinator_rebalance/1),
    {EvacuationReplies, _} = emqx_node_rebalance_status_proto_v1:evacuation_status(RunningNodes),
    Evacuations = enabled_statuses(EvacuationReplies, fun evacuation/1),
    #{rebalances => Rebalances, evacuations => Evacuations}.

%% Render a coordinator status map as printable text.
-spec format_coordinator_status(map()) -> iodata().
format_coordinator_status(Status) ->
    FieldOrder = coordinator_status_field_format_order(),
    format_status(Status, FieldOrder).

%%--------------------------------------------------------------------
%% Internal functions
%%--------------------------------------------------------------------

%% Keep the `{Node, {enabled, Status}}` replies, formatting each status with
%% `Format`; every other reply is dropped.
enabled_statuses(Replies, Format) ->
    lists:filtermap(
        fun
            ({Node, {enabled, Status}}) -> {true, {Node, Format(Status)}};
            (_Other) -> false
        end,
        Replies
    ).

%% Translate the evacuation state machine's status map into the public
%% status format. Evacuation always aims at zero connections and sessions.
evacuation(Status) ->
    State = maps:get(state, Status),
    ConnEvictRate = maps:get(conn_evict_rate, Status),
    SessEvictRate = maps:get(sess_evict_rate, Status),
    Recipients = maps:get(migrate_to, Status),
    ChannelStats = #{
        initial_connected => maps:get(initial_conns, Status),
        current_connected => maps:get(current_conns, Status),
        initial_sessions => maps:get(initial_sessions, Status),
        current_sessions => maps:get(current_sessions, Status)
    },
    #{
        state => State,
        connection_eviction_rate => ConnEvictRate,
        session_eviction_rate => SessEvictRate,
        connection_goal => 0,
        session_goal => 0,
        session_recipients => Recipients,
        stats => ChannelStats
    }.

%% A node takes part in a rebalance (from the local point of view) only when
%% it is one of the donors.
local_rebalance(#{donors := Donors} = Stats, Node) ->
    IsDonor = lists:member(Node, Donors),
    case IsDonor of
        false -> disabled;
        true -> {rebalance, donor_rebalance(Stats, Node)}
    end.

%% Status of a running rebalance as seen from donor node `Node`: the
%% coordinator's settings plus channel counts taken from the eviction agent
%% on the node where this function runs.
donor_rebalance(Status, Node) ->
    EvictOpts = maps:get(opts, Status),
    ConnCountsAtStart = maps:get(initial_conn_counts, Status),
    SessCountsAtStart = maps:get(initial_sess_counts, Status),

    ChannelStats = #{
        initial_connected => maps:get(Node, ConnCountsAtStart),
        initial_sessions => maps:get(Node, SessCountsAtStart),
        current_connected => emqx_eviction_agent:connection_count(),
        current_sessions => emqx_eviction_agent:session_count(),
        current_disconnected_sessions => emqx_eviction_agent:session_count(
            disconnected
        )
    },
    Mandatory = [
        {state, maps:get(state, Status)},
        {coordinator_node, maps:get(coordinator_node, Status)},
        {connection_eviction_rate, maps:get(conn_evict_rate, EvictOpts)},
        {session_eviction_rate, maps:get(sess_evict_rate, EvictOpts)},
        {recipients, maps:get(recipients, Status)},
        {stats, ChannelStats}
    ],
    maps:from_list(
        Mandatory ++
            optional_status_field(connection_goal, recipient_conn_avg, Status) ++
            optional_status_field(disconnected_session_goal, recipient_sess_avg, Status)
    ).

%% Status of a running rebalance as seen from the coordinator. The goal and
%% average fields are included only when the coordinator status carries them.
coordinator_rebalance(Status) ->
    EvictOpts = maps:get(opts, Status),
    Mandatory = [
        {state, maps:get(state, Status)},
        {coordinator_node, maps:get(coordinator_node, Status)},
        {connection_eviction_rate, maps:get(conn_evict_rate, EvictOpts)},
        {session_eviction_rate, maps:get(sess_evict_rate, EvictOpts)},
        {recipients, maps:get(recipients, Status)},
        {donors, maps:get(donors, Status)}
    ],
    maps:from_list(
        Mandatory ++
            optional_status_field(connection_goal, recipient_conn_avg, Status) ++
            optional_status_field(disconnected_session_goal, recipient_sess_avg, Status) ++
            optional_status_field(donor_conn_avg, donor_conn_avg, Status) ++
            optional_status_field(donor_sess_avg, donor_sess_avg, Status)
    ).

%% `[{OutKey, Value}]` when `SrcKey` is present in `Status`, `[]` otherwise.
optional_status_field(OutKey, SrcKey, Status) ->
    case maps:find(SrcKey, Status) of
        {ok, Value} -> [{OutKey, Value}];
        error -> []
    end.
+ +local_status_field_format_order() -> + [ + state, + coordinator_node, + connection_eviction_rate, + session_eviction_rate, + connection_goal, + session_goal, + disconnected_session_goal, + session_recipients, + recipients, + stats + ]. + +coordinator_status_field_format_order() -> + [ + state, + coordinator_node, + donors, + recipients, + connection_eviction_rate, + session_eviction_rate, + connection_goal, + disconnected_session_goal, + donor_conn_avg, + donor_sess_avg + ]. + +format_status(Status, FieldOrder) -> + Fields = lists:flatmap( + fun(FieldName) -> + maps:to_list(maps:with([FieldName], Status)) + end, + FieldOrder + ), + lists:map( + fun format_local_status_field/1, + Fields + ). + +format_local_status_field({state, State}) -> + io_lib:format("Rebalance state: ~p~n", [State]); +format_local_status_field({coordinator_node, Node}) -> + io_lib:format("Coordinator node: ~p~n", [Node]); +format_local_status_field({connection_eviction_rate, ConnEvictRate}) -> + io_lib:format("Connection eviction rate: ~p connections/second~n", [ConnEvictRate]); +format_local_status_field({session_eviction_rate, SessEvictRate}) -> + io_lib:format("Session eviction rate: ~p sessions/second~n", [SessEvictRate]); +format_local_status_field({connection_goal, ConnGoal}) -> + io_lib:format("Connection goal: ~p~n", [ConnGoal]); +format_local_status_field({session_goal, SessGoal}) -> + io_lib:format("Session goal: ~p~n", [SessGoal]); +format_local_status_field({disconnected_session_goal, DisconnSessGoal}) -> + io_lib:format("Disconnected session goal: ~p~n", [DisconnSessGoal]); +format_local_status_field({session_recipients, SessionRecipients}) -> + io_lib:format("Session recipient nodes: ~p~n", [SessionRecipients]); +format_local_status_field({recipients, Recipients}) -> + io_lib:format("Recipient nodes: ~p~n", [Recipients]); +format_local_status_field({donors, Donors}) -> + io_lib:format("Donor nodes: ~p~n", [Donors]); +format_local_status_field({donor_conn_avg, DonorConnAvg}) -> 
+ io_lib:format("Current average donor node connection count: ~p~n", [DonorConnAvg]); +format_local_status_field({donor_sess_avg, DonorSessAvg}) -> + io_lib:format("Current average donor node disconnected session count: ~p~n", [DonorSessAvg]); +format_local_status_field({stats, Stats}) -> + format_local_stats(Stats). + +format_local_stats(Stats) -> + [ + "Channel statistics:\n" + | lists:map( + fun({Name, Value}) -> + io_lib:format(" ~p: ~p~n", [Name, Value]) + end, + maps:to_list(Stats) + ) + ]. + +evacuation_status() -> + {node(), emqx_node_rebalance_evacuation:status()}. + +rebalance_status() -> + {node(), emqx_node_rebalance:status()}. diff --git a/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_sup.erl b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_sup.erl new file mode 100644 index 000000000..cfaccc4c2 --- /dev/null +++ b/lib-ee/emqx_node_rebalance/src/emqx_node_rebalance_sup.erl @@ -0,0 +1,35 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_sup). + +-behaviour(supervisor). + +-export([start_link/0]). + +-export([init/1]). + +start_link() -> + supervisor:start_link({local, ?MODULE}, ?MODULE, []). + +init([]) -> + Childs = [ + child_spec(emqx_node_rebalance_evacuation, []), + child_spec(emqx_node_rebalance_agent, []), + child_spec(emqx_node_rebalance, []) + ], + {ok, { + #{strategy => one_for_one, intensity => 10, period => 3600}, + Childs + }}. + +child_spec(Mod, Args) -> + #{ + id => Mod, + start => {Mod, start_link, Args}, + restart => permanent, + shutdown => 5000, + type => worker, + modules => [Mod] + }. 
diff --git a/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_api_proto_v1.erl b/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_api_proto_v1.erl new file mode 100644 index 000000000..131973932 --- /dev/null +++ b/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_api_proto_v1.erl @@ -0,0 +1,43 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_api_proto_v1). + +-behaviour(emqx_bpapi). + +-export([ + introduced_in/0, + + node_rebalance_evacuation_start/2, + node_rebalance_evacuation_stop/1, + + node_rebalance_start/2, + node_rebalance_stop/1 +]). + +-include_lib("emqx/include/bpapi.hrl"). +-include_lib("emqx/include/types.hrl"). + +introduced_in() -> + "5.0.22". + +-spec node_rebalance_evacuation_start(node(), emqx_node_rebalance_evacuation:start_opts()) -> + emqx_rpc:badrpc() | ok_or_error(emqx_node_rebalance_evacuation:start_error()). +node_rebalance_evacuation_start(Node, #{} = Opts) -> + rpc:call(Node, emqx_node_rebalance_evacuation, start, [Opts]). + +-spec node_rebalance_evacuation_stop(node()) -> + emqx_rpc:badrpc() | ok_or_error(not_started). +node_rebalance_evacuation_stop(Node) -> + rpc:call(Node, emqx_node_rebalance_evacuation, stop, []). + +-spec node_rebalance_start(node(), emqx_node_rebalance:start_opts()) -> + emqx_rpc:badrpc() | ok_or_error(emqx_node_rebalance:start_error()). +node_rebalance_start(Node, Opts) -> + rpc:call(Node, emqx_node_rebalance, start, [Opts]). + +-spec node_rebalance_stop(node()) -> + emqx_rpc:badrpc() | ok_or_error(not_started). +node_rebalance_stop(Node) -> + rpc:call(Node, emqx_node_rebalance, stop, []). 
diff --git a/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_evacuation_proto_v1.erl b/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_evacuation_proto_v1.erl new file mode 100644 index 000000000..f5a6e1077 --- /dev/null +++ b/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_evacuation_proto_v1.erl @@ -0,0 +1,22 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_evacuation_proto_v1). + +-behaviour(emqx_bpapi). + +-export([ + introduced_in/0, + + available_nodes/1 +]). + +-include_lib("emqx/include/bpapi.hrl"). + +introduced_in() -> + "5.0.22". + +-spec available_nodes([node()]) -> emqx_rpc:multicall_result(node()). +available_nodes(Nodes) -> + rpc:multicall(Nodes, emqx_node_rebalance_evacuation, is_node_available, []). diff --git a/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_proto_v1.erl b/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_proto_v1.erl new file mode 100644 index 000000000..98625d4fd --- /dev/null +++ b/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_proto_v1.erl @@ -0,0 +1,62 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_proto_v1). + +-behaviour(emqx_bpapi). + +-export([ + introduced_in/0, + + available_nodes/1, + evict_connections/2, + evict_sessions/4, + connection_counts/1, + session_counts/1, + enable_rebalance_agent/2, + disable_rebalance_agent/2, + disconnected_session_counts/1 +]). + +-include_lib("emqx/include/bpapi.hrl"). +-include_lib("emqx/include/types.hrl"). + +introduced_in() -> + "5.0.22". + +-spec available_nodes([node()]) -> emqx_rpc:multicall_result(node()). 
%% Calls emqx_node_rebalance:is_node_available/0 on every target node.
available_nodes(TargetNodes) ->
    rpc:multicall(TargetNodes, emqx_node_rebalance, is_node_available, []).

%% Asks the eviction agent on every target node to evict `Limit` connections.
-spec evict_connections([node()], non_neg_integer()) ->
    emqx_rpc:multicall_result(ok_or_error(disabled)).
evict_connections(TargetNodes, Limit) ->
    rpc:multicall(TargetNodes, emqx_eviction_agent, evict_connections, [Limit]).

%% Asks the eviction agent on every target node to evict `Limit` sessions in
%% connection state `ConnState` towards `Recipients`.
-spec evict_sessions([node()], non_neg_integer(), [node()], emqx_channel:conn_state()) ->
    emqx_rpc:multicall_result(ok_or_error(disabled)).
evict_sessions(TargetNodes, Limit, Recipients, ConnState) ->
    rpc:multicall(TargetNodes, emqx_eviction_agent, evict_sessions, [Limit, Recipients, ConnState]).

%% Calls emqx_node_rebalance:connection_count/0 on every target node.
-spec connection_counts([node()]) -> emqx_rpc:multicall_result({ok, non_neg_integer()}).
connection_counts(TargetNodes) ->
    rpc:multicall(TargetNodes, emqx_node_rebalance, connection_count, []).

%% Calls emqx_node_rebalance:session_count/0 on every target node.
-spec session_counts([node()]) -> emqx_rpc:multicall_result({ok, non_neg_integer()}).
session_counts(TargetNodes) ->
    rpc:multicall(TargetNodes, emqx_node_rebalance, session_count, []).

%% Enables the rebalance agent on every target node on behalf of `Coordinator`.
-spec enable_rebalance_agent([node()], pid()) ->
    emqx_rpc:multicall_result(ok_or_error(already_enabled | eviction_agent_busy)).
enable_rebalance_agent(TargetNodes, Coordinator) ->
    rpc:multicall(TargetNodes, emqx_node_rebalance_agent, enable, [Coordinator]).

%% Disables the rebalance agent on every target node on behalf of `Coordinator`.
-spec disable_rebalance_agent([node()], pid()) ->
    emqx_rpc:multicall_result(ok_or_error(already_disabled | invalid_coordinator)).
disable_rebalance_agent(TargetNodes, Coordinator) ->
    rpc:multicall(TargetNodes, emqx_node_rebalance_agent, disable, [Coordinator]).

%% Calls emqx_node_rebalance:disconnected_session_count/0 on every target node.
-spec disconnected_session_counts([node()]) -> emqx_rpc:multicall_result({ok, non_neg_integer()}).
disconnected_session_counts(TargetNodes) ->
    rpc:multicall(TargetNodes, emqx_node_rebalance, disconnected_session_count, []).
diff --git a/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_status_proto_v1.erl b/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_status_proto_v1.erl new file mode 100644 index 000000000..e3e4a423c --- /dev/null +++ b/lib-ee/emqx_node_rebalance/src/proto/emqx_node_rebalance_status_proto_v1.erl @@ -0,0 +1,36 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_status_proto_v1). + +-behaviour(emqx_bpapi). + +-export([ + introduced_in/0, + + local_status/1, + rebalance_status/1, + evacuation_status/1 +]). + +-include_lib("emqx/include/bpapi.hrl"). +-include_lib("emqx/include/types.hrl"). + +introduced_in() -> + "5.0.22". + +-spec local_status(node()) -> + emqx_rpc:badrpc() | disabled | {evacuation, map()} | {rebalance, map()}. +local_status(Node) -> + rpc:call(Node, emqx_node_rebalance_status, local_status, []). + +-spec rebalance_status([node()]) -> + emqx_rpc:multicall_result({node(), map()}). +rebalance_status(Nodes) -> + rpc:multicall(Nodes, emqx_node_rebalance_status, rebalance_status, []). + +-spec evacuation_status([node()]) -> + emqx_rpc:multicall_result({node(), map()}). +evacuation_status(Nodes) -> + rpc:multicall(Nodes, emqx_node_rebalance_status, evacuation_status, []). diff --git a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_SUITE.erl b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_SUITE.erl new file mode 100644 index 000000000..a818145a2 --- /dev/null +++ b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_SUITE.erl @@ -0,0 +1,229 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_SUITE). + +-compile(export_all). 
-compile(nowarn_export_all).

-include_lib("emqx/include/emqx.hrl").
-include_lib("emqx/include/emqx_mqtt.hrl").
-include_lib("emqx/include/asserts.hrl").
-include_lib("eunit/include/eunit.hrl").
-include_lib("common_test/include/ct.hrl").
-include_lib("snabbkaffe/include/snabbkaffe.hrl").

-import(
    emqx_eviction_agent_test_helpers,
    [emqtt_connect_many/1, emqtt_connect_many/2, stop_many/1, case_specific_node_name/3]
).

%% Applications started on (and stopped from) every cluster node.
-define(START_APPS, [emqx_eviction_agent, emqx_node_rebalance]).

all() ->
    emqx_common_test_helpers:all(?MODULE).

init_per_suite(Config) ->
    ok = emqx_common_test_helpers:start_apps([]),
    Config.

end_per_suite(_Config) ->
    ok = emqx_common_test_helpers:stop_apps([]).

%% Every test case gets a fresh two-node cluster (donor + recipient) and a
%% running snabbkaffe trace.
init_per_testcase(Case, Config) ->
    DonorSpec = {case_specific_node_name(?MODULE, Case, '_donor'), 2883},
    RecipientSpec = {case_specific_node_name(?MODULE, Case, '_recipient'), 3883},
    Cluster = emqx_eviction_agent_test_helpers:start_cluster(
        [DonorSpec, RecipientSpec],
        ?START_APPS
    ),
    ok = snabbkaffe:start_trace(),
    [{cluster_nodes, Cluster} | Config].

end_per_testcase(_Case, Config) ->
    ok = snabbkaffe:stop(),
    Cluster = ?config(cluster_nodes, Config),
    ok = emqx_eviction_agent_test_helpers:stop_cluster(Cluster, ?START_APPS).

%%--------------------------------------------------------------------
%% Tests
%%--------------------------------------------------------------------

%% Connect 500 clients to the donor, run a rebalance and check that the
%% donor ends up at most 50 connections / disconnected sessions above the
%% recipient.
t_rebalance(Config) ->
    process_flag(trap_exit, true),

    [{Donor, DonorPort}, {Recipient, _}] = ?config(cluster_nodes, Config),

    Clients = emqtt_connect_many(DonorPort, 500),

    RebalanceOpts = #{
        conn_evict_rate => 10,
        sess_evict_rate => 10,
        evict_interval => 10,
        abs_conn_threshold => 50,
        abs_sess_threshold => 50,
        rel_conn_threshold => 1.0,
        rel_sess_threshold => 1.0,
        wait_health_check => 0.01,
        wait_takeover => 0.01,
        nodes => [Donor, Recipient]
    },

    ?assertWaitEvent(
        ok = rpc:call(Donor, emqx_node_rebalance, start, [RebalanceOpts]),
        #{?snk_kind := emqx_node_rebalance_evict_sess_over},
        10000
    ),

    %% Ask the eviction agent on `Node` for one of its counters.
    AgentCount = fun(Node, Counter, Args) ->
        rpc:call(Node, emqx_eviction_agent, Counter, Args)
    end,

    DonorConns = AgentCount(Donor, connection_count, []),
    DonorSessions = AgentCount(Donor, session_count, []),
    DonorDisconnected = AgentCount(Donor, session_count, [disconnected]),

    RecipientConns = AgentCount(Recipient, connection_count, []),
    RecipientSessions = AgentCount(Recipient, session_count, []),
    RecipientDisconnected = AgentCount(Recipient, session_count, [disconnected]),

    ct:pal(
        "Donor: conn=~p, sess=~p, dsess=~p",
        [DonorConns, DonorSessions, DonorDisconnected]
    ),
    ct:pal(
        "Recipient: conn=~p, sess=~p, dsess=~p",
        [RecipientConns, RecipientSessions, RecipientDisconnected]
    ),

    ?assert(DonorConns - 50 =< RecipientConns),
    ?assert(DonorDisconnected - 50 =< RecipientDisconnected),

    ok = stop_many(Clients).

%% Stopping the recipient node right after the rebalance was started must
%% leave the coordinator in the `disabled` state.
t_rebalance_node_crash(Config) ->
    process_flag(trap_exit, true),

    [{Donor, DonorPort}, {Recipient, _}] = ?config(cluster_nodes, Config),

    Clients = emqtt_connect_many(DonorPort, 500),

    RebalanceOpts = #{
        conn_evict_rate => 10,
        sess_evict_rate => 10,
        evict_interval => 10,
        abs_conn_threshold => 50,
        abs_sess_threshold => 50,
        rel_conn_threshold => 1.0,
        rel_sess_threshold => 1.0,
        wait_health_check => 0.01,
        wait_takeover => 0.01,
        nodes => [Donor, Recipient]
    },

    ?assertWaitEvent(
        begin
            ok = rpc:call(Donor, emqx_node_rebalance, start, [RebalanceOpts]),
            emqx_common_test_helpers:stop_slave(Recipient)
        end,
        #{?snk_kind := emqx_node_rebalance_started},
        1000
    ),

    StatusAfterCrash = rpc:call(Donor, emqx_node_rebalance, status, []),
    ?assertEqual(disabled, StatusAfterCrash),

    ok = stop_many(Clients).

%% Starting a rebalance is refused with `nothing_to_balance` both on an idle
%% cluster and with only 50 connections on the donor.
t_no_need_to_rebalance(Config) ->
    process_flag(trap_exit, true),

    [{Donor, DonorPort}, {Recipient, _}] = ?config(cluster_nodes, Config),

    RebalanceOpts = #{
        conn_evict_rate => 10,
        sess_evict_rate => 10,
        evict_interval => 10,
        abs_conn_threshold => 50,
        abs_sess_threshold => 50,
        rel_conn_threshold => 1.0,
        rel_sess_threshold => 1.0,
        wait_health_check => 0.01,
        wait_takeover => 0.01,
        nodes => [Donor, Recipient]
    },
    StartRebalance = fun() ->
        rpc:call(Donor, emqx_node_rebalance, start, [RebalanceOpts])
    end,

    ?assertEqual({error, nothing_to_balance}, StartRebalance()),

    Clients = emqtt_connect_many(DonorPort, 50),

    ?assertEqual({error, nothing_to_balance}, StartRebalance()),

    ok = stop_many(Clients).
+ +t_unknown_mesages(Config) -> + process_flag(trap_exit, true), + [{DonorNode, DonorPort}, {RecipientNode, _RecipientPort}] = ?config(cluster_nodes, Config), + + Nodes = [DonorNode, RecipientNode], + + Conns = emqtt_connect_many(DonorPort, 500), + + Opts = #{ + wait_health_check => 100, + abs_conn_threshold => 50, + nodes => Nodes + }, + + Pid = rpc:call(DonorNode, erlang, whereis, [emqx_node_rebalance]), + + Pid ! unknown, + ok = gen_server:cast(Pid, unknown), + ?assertEqual( + ignored, + gen_server:call(Pid, unknown) + ), + + ok = rpc:call(DonorNode, emqx_node_rebalance, start, [Opts]), + + Pid ! unknown, + ok = gen_server:cast(Pid, unknown), + ?assertEqual( + ignored, + gen_server:call(Pid, unknown) + ), + + ok = stop_many(Conns). + +t_available_nodes(Config) -> + [{DonorNode, _DonorPort}, {RecipientNode, _RecipientPort}] = ?config(cluster_nodes, Config), + + %% Start eviction agent on RecipientNode so that it will be "occupied" + %% and not available for rebalance + ok = rpc:call(RecipientNode, emqx_eviction_agent, enable, [test_rebalance, undefined]), + + %% Only DonorNode should be is available for rebalance, since RecipientNode is "occupied" + ?assertEqual( + [DonorNode], + rpc:call( + DonorNode, + emqx_node_rebalance, + available_nodes, + [[DonorNode, RecipientNode]] + ) + ). diff --git a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_agent_SUITE.erl b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_agent_SUITE.erl new file mode 100644 index 000000000..8b21f9433 --- /dev/null +++ b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_agent_SUITE.erl @@ -0,0 +1,214 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_agent_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("emqx/include/emqx.hrl"). 
+-include_lib("emqx/include/emqx_mqtt.hrl").
+-include_lib("eunit/include/eunit.hrl").
+-include_lib("common_test/include/ct.hrl").
+-include_lib("snabbkaffe/include/snabbkaffe.hrl").
+
+-import(
+    emqx_eviction_agent_test_helpers,
+    [case_specific_node_name/2]
+).
+
+%% The suite consists of the `local` and the `cluster` group.
+all() ->
+    [{group, Name} || Name <- [local, cluster]].
+
+groups() ->
+    LocalCases = [t_enable_disable, t_enable_egent_busy, t_unknown_messages],
+    ClusterCases = [t_rebalance_agent_coordinator_fail, t_rebalance_agent_fail],
+    [{local, [], LocalCases}, {cluster, [], ClusterCases}].
+
+init_per_suite(Config) ->
+    Apps = [emqx_eviction_agent, emqx_node_rebalance],
+    ok = emqx_common_test_helpers:start_apps(Apps),
+    Config.
+
+end_per_suite(_Config) ->
+    Apps = [emqx_eviction_agent, emqx_node_rebalance],
+    ok = emqx_common_test_helpers:stop_apps(Apps).
+
+%% Records in the config whether the cases of the group need a cluster.
+init_per_group(Group, Config) when Group =:= local; Group =:= cluster ->
+    [{cluster, Group =:= cluster} | Config].
+
+end_per_group(_Group, _Config) ->
+    ok.
+
+%% For cluster cases a single-node cluster with a case-specific node name is
+%% started and stored under `cluster_nodes`; local cases get the config as is.
+init_per_testcase(Case, Config) ->
+    case ?config(cluster, Config) of
+        false ->
+            Config;
+        true ->
+            NodeSpecs = [{case_specific_node_name(?MODULE, Case), 2883}],
+            Apps = [emqx_eviction_agent, emqx_node_rebalance],
+            Started = emqx_eviction_agent_test_helpers:start_cluster(NodeSpecs, Apps),
+            [{cluster_nodes, Started} | Config]
+    end.
+
+%% Stops the cluster started by init_per_testcase/2, if there was one.
+end_per_testcase(_Case, Config) ->
+    case ?config(cluster, Config) of
+        false ->
+            ok;
+        true ->
+            Started = ?config(cluster_nodes, Config),
+            Apps = [emqx_eviction_agent, emqx_node_rebalance],
+            emqx_eviction_agent_test_helpers:stop_cluster(Started, Apps)
+    end.
+
+%%--------------------------------------------------------------------
+%% Tests
+%%--------------------------------------------------------------------
+
+%% Local tests
+
+%% Walks the rebalance agent through enable and disable with the test process
+%% as coordinator, checking the status and the error returned for repeated
+%% calls and for a disable request coming from a different pid.
+t_enable_disable(_Config) ->
+    Coordinator = self(),
+
+    ?assertEqual(
+        disabled,
+        emqx_node_rebalance_agent:status()
+    ),
+
+    ?assertEqual(
+        ok,
+        emqx_node_rebalance_agent:enable(Coordinator)
+    ),
+
+    ?assertEqual(
+        {error, already_enabled},
+        emqx_node_rebalance_agent:enable(Coordinator)
+    ),
+
+    ?assertEqual(
+        {enabled, Coordinator},
+        emqx_node_rebalance_agent:status()
+    ),
+
+    %% A pid other than the one used for enable/1 is not accepted.
+    OtherPid = spawn_link(fun() -> ok end),
+    ?assertEqual(
+        {error, invalid_coordinator},
+        emqx_node_rebalance_agent:disable(OtherPid)
+    ),
+
+    ?assertEqual(
+        ok,
+        emqx_node_rebalance_agent:disable(Coordinator)
+    ),
+
+    ?assertEqual(
+        {error, already_disabled},
+        emqx_node_rebalance_agent:disable(Coordinator)
+    ),
+
+    ?assertEqual(
+        disabled,
+        emqx_node_rebalance_agent:status()
+    ).
+
+%% Enabling the rebalance agent must fail with `eviction_agent_busy` while the
+%% eviction agent is already enabled with `rebalance_test`.
+t_enable_egent_busy(_Config) ->
+    ok = emqx_eviction_agent:enable(rebalance_test, undefined),
+
+    EnableResult = emqx_node_rebalance_agent:enable(self()),
+
+    %% Release the eviction agent before asserting, so that a failed assertion
+    %% does not leave it enabled for the test cases that run afterwards.
+    ok = emqx_eviction_agent:disable(rebalance_test),
+
+    ?assertEqual(
+        {error, eviction_agent_busy},
+        EnableResult
+    ).
+
+%% Sends an unexpected cast, info message and call to the agent process; the
+%% call is expected to be answered with `ignored`.
+t_unknown_messages(_Config) ->
+    Pid = whereis(emqx_node_rebalance_agent),
+
+    ok = gen_server:cast(Pid, unknown),
+
+    Pid ! unknown,
+
+    ignored = gen_server:call(Pid, unknown).
+
+%% Cluster tests
+
+% The following tests verify that emqx_node_rebalance_agent correctly links
+% the coordinator process with the emqx_eviction_agent processes.
+ +t_rebalance_agent_coordinator_fail(Config) -> + process_flag(trap_exit, true), + + [{Node, _}] = ?config(cluster_nodes, Config), + + CoordinatorPid = spawn_link( + fun() -> + receive + done -> ok + end + end + ), + + ?assertEqual( + disabled, + rpc:call(Node, emqx_eviction_agent, status, []) + ), + + ?assertEqual( + ok, + rpc:call(Node, emqx_node_rebalance_agent, enable, [CoordinatorPid]) + ), + + ?assertMatch( + {enabled, _}, + rpc:call(Node, emqx_eviction_agent, status, []) + ), + + EvictionAgentPid = rpc:call(Node, erlang, whereis, [emqx_eviction_agent]), + true = link(EvictionAgentPid), + + true = exit(CoordinatorPid, kill), + + receive + {'EXIT', EvictionAgentPid, _} -> true + after 1000 -> + ct:fail("emqx_eviction_agent did not exit") + end. + +t_rebalance_agent_fail(Config) -> + process_flag(trap_exit, true), + + [{Node, _}] = ?config(cluster_nodes, Config), + + CoordinatorPid = spawn_link( + fun() -> + receive + done -> ok + end + end + ), + + ?assertEqual( + ok, + rpc:call(Node, emqx_node_rebalance_agent, enable, [CoordinatorPid]) + ), + + EvictionAgentPid = rpc:call(Node, erlang, whereis, [emqx_eviction_agent]), + true = exit(EvictionAgentPid, kill), + + receive + {'EXIT', CoordinatorPid, _} -> true + after 1000 -> + ct:fail("emqx_node_rebalance_agent did not exit") + end. diff --git a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl new file mode 100644 index 000000000..21608b8bc --- /dev/null +++ b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_api_SUITE.erl @@ -0,0 +1,444 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_api_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("eunit/include/eunit.hrl"). 
+-include_lib("common_test/include/ct.hrl").
+
+-import(
+    emqx_mgmt_api_test_util,
+    [
+        request/2,
+        request/3,
+        uri/1
+    ]
+).
+
+-import(
+    emqx_eviction_agent_test_helpers,
+    [emqtt_connect_many/2, stop_many/1, case_specific_node_name/3]
+).
+
+-define(START_APPS, [emqx_eviction_agent, emqx_node_rebalance]).
+
+all() ->
+    emqx_common_test_helpers:all(?MODULE).
+
+init_per_suite(Config) ->
+    ok = emqx_common_test_helpers:start_apps(?START_APPS),
+    Config.
+
+end_per_suite(_Config) ->
+    ok = emqx_common_test_helpers:stop_apps(?START_APPS).
+
+%% Starts a donor/recipient cluster with case-specific node names and data
+%% dir, initialises the management API test utilities on the donor and makes
+%% local requests use the donor's auth header.
+init_per_testcase(Case, Config) ->
+    NodeSpecs = [
+        {case_specific_node_name(?MODULE, Case, '_donor'), 2883},
+        {case_specific_node_name(?MODULE, Case, '_recipient'), 3883}
+    ],
+    Env = [{emqx, data_dir, case_specific_data_dir(Case, Config)}],
+    ClusterNodes = emqx_eviction_agent_test_helpers:start_cluster(NodeSpecs, ?START_APPS, Env),
+    [{DonorNode, _} | _] = ClusterNodes,
+    ok = rpc:call(DonorNode, emqx_mgmt_api_test_util, init_suite, []),
+    ok = take_auth_header_from(DonorNode),
+    [{cluster_nodes, ClusterNodes} | Config].
+
+%% Stops the cluster started for the test case.
+end_per_testcase(_Case, Config) ->
+    ClusterNodes = ?config(cluster_nodes, Config),
+    emqx_eviction_agent_test_helpers:stop_cluster(ClusterNodes, ?START_APPS).
+
+%%--------------------------------------------------------------------
+%% Tests
+%%--------------------------------------------------------------------
+
+%% Each malformed request body, and a request addressed to an unknown node,
+%% must be answered with 400. A well-formed request must be answered with 200,
+%% after which the donor is listed among the evacuations in the global status.
+t_start_evacuation_validation(Config) ->
+    [{DonorNode, _}, {RecipientNode, _}] = ?config(cluster_nodes, Config),
+    DonorNodeBin = atom_to_binary(DonorNode),
+    StartPath = ["load_rebalance", atom_to_list(DonorNode), "evacuation", "start"],
+
+    InvalidBodies = [
+        #{conn_evict_rate => <<"conn">>},
+        #{sess_evict_rate => <<"sess">>},
+        #{redirect_to => 123},
+        #{wait_takeover => <<"wait">>},
+        #{migrate_to => []},
+        #{migrate_to => <<"migrate_to">>},
+        #{migrate_to => [<<"bad_node">>]},
+        #{migrate_to => [<<"bad_node">>, DonorNodeBin]},
+        #{unknown => <<"Value">>}
+    ],
+    _ = [
+        ?assertMatch({ok, 400, #{}}, api_post(StartPath, Body))
+     || Body <- InvalidBodies
+    ],
+
+    ?assertMatch(
+        {ok, 400, #{}},
+        api_post(["load_rebalance", "bad@node", "evacuation", "start"], #{})
+    ),
+
+    ValidBody = #{
+        conn_evict_rate => 10,
+        sess_evict_rate => 10,
+        wait_takeover => 10,
+        redirect_to => <<"srv">>,
+        migrate_to => [atom_to_binary(RecipientNode)]
+    },
+    ?assertMatch({ok, 200, #{}}, api_post(StartPath, ValidBody)),
+
+    ?assertMatch(
+        {ok, 200, #{<<"evacuations">> := [#{<<"node">> := DonorNodeBin}]}},
+        api_get(["load_rebalance", "global_status"])
+    ).
+
+%% Each malformed rebalance request body, and a request addressed to an
+%% unknown node, must be answered with 400. With 50 clients connected to the
+%% donor a well-formed request must be answered with 200, after which the
+%% donor is listed among the rebalances in the global status.
+t_start_rebalance_validation(Config) ->
+    process_flag(trap_exit, true),
+
+    [{DonorNode, DonorPort}, {RecipientNode, _}] = ?config(cluster_nodes, Config),
+    DonorNodeBin = atom_to_binary(DonorNode),
+    RecipientNodeBin = atom_to_binary(RecipientNode),
+    StartPath = ["load_rebalance", atom_to_list(DonorNode), "start"],
+
+    InvalidBodies = [
+        #{conn_evict_rate => <<"conn">>},
+        #{sess_evict_rate => <<"sess">>},
+        #{abs_conn_threshold => <<"act">>},
+        #{rel_conn_threshold => <<"rct">>},
+        #{abs_sess_threshold => <<"act">>},
+        #{rel_sess_threshold => <<"rct">>},
+        #{wait_takeover => <<"wait">>},
+        #{wait_health_check => <<"wait">>},
+        #{nodes => <<"nodes">>},
+        #{nodes => []},
+        #{nodes => [<<"bad_node">>]},
+        #{nodes => [<<"bad_node">>, DonorNodeBin]},
+        #{unknown => <<"Value">>}
+    ],
+    _ = [
+        ?assertMatch({ok, 400, #{}}, api_post(StartPath, Body))
+     || Body <- InvalidBodies
+    ],
+
+    ?assertMatch(
+        {ok, 400, #{}},
+        api_post(["load_rebalance", "bad@node", "start"], #{})
+    ),
+
+    Clients = emqtt_connect_many(DonorPort, 50),
+
+    ValidBody = #{
+        conn_evict_rate => 10,
+        sess_evict_rate => 10,
+        wait_takeover => 10,
+        wait_health_check => 10,
+        abs_conn_threshold => 10,
+        rel_conn_threshold => 1.001,
+        abs_sess_threshold => 10,
+        rel_sess_threshold => 1.001,
+        nodes => [DonorNodeBin, RecipientNodeBin]
+    },
+    ?assertMatch({ok, 200, #{}}, api_post(StartPath, ValidBody)),
+
+    ?assertMatch(
+        {ok, 200, #{<<"rebalances">> := [#{<<"node">> := DonorNodeBin}]}},
+        api_get(["load_rebalance", "global_status"])
+    ),
+
+    ok = stop_many(Clients).
+
+%% Starts an evacuation through the REST API, checks the translated local and
+%% global status while it is running, stops it again and finally checks that
+%% the local status is `disabled` and the global status lists nothing.
+t_start_stop_evacuation(Config) ->
+    [{DonorNode, _}, {RecipientNode, _}] = ?config(cluster_nodes, Config),
+    DonorNodeBin = atom_to_binary(DonorNode),
+    EvacuationPath = fun(Action) ->
+        ["load_rebalance", atom_to_list(DonorNode), "evacuation", Action]
+    end,
+    LocalStatusPath = ["load_rebalance", "status"],
+    GlobalStatusPath = ["load_rebalance", "global_status"],
+
+    %% start
+    StartOpts = maps:merge(
+        emqx_node_rebalance_api:rebalance_evacuation_example(),
+        #{migrate_to => [atom_to_binary(RecipientNode)]}
+    ),
+    ?assertMatch({ok, 200, #{}}, api_post(EvacuationPath("start"), StartOpts)),
+
+    %% local status while running
+    LocalReply = api_get(LocalStatusPath),
+    ?assertMatch({ok, 200, _}, LocalReply),
+    {ok, 200, LocalStatus} = LocalReply,
+    ?assertMatch(
+        #{
+            process := evacuation,
+            connection_eviction_rate := 100,
+            session_eviction_rate := 100,
+            connection_goal := 0,
+            session_goal := 0,
+            stats := #{
+                initial_connected := _,
+                current_connected := _,
+                initial_sessions := _,
+                current_sessions := _
+            }
+        },
+        emqx_node_rebalance_api:translate(local_status_enabled, LocalStatus)
+    ),
+
+    %% global status while running
+    GlobalReply = api_get(GlobalStatusPath),
+    ?assertMatch({ok, 200, _}, GlobalReply),
+    {ok, 200, GlobalStatus} = GlobalReply,
+    ?assertMatch(
+        #{
+            rebalances := [],
+            evacuations := [
+                #{
+                    node := DonorNodeBin,
+                    connection_eviction_rate := 100,
+                    session_eviction_rate := 100,
+                    connection_goal := 0,
+                    session_goal := 0,
+                    stats := #{
+                        initial_connected := _,
+                        current_connected := _,
+                        initial_sessions := _,
+                        current_sessions := _
+                    }
+                }
+            ]
+        },
+        emqx_node_rebalance_api:translate(global_status, GlobalStatus)
+    ),
+
+    %% stop
+    ?assertMatch({ok, 200, #{}}, api_post(EvacuationPath("stop"), #{})),
+
+    ?assertMatch(
+        {ok, 200, #{<<"status">> := <<"disabled">>}},
+        api_get(LocalStatusPath)
+    ),
+
+    ?assertMatch(
+        {ok, 200, #{<<"evacuations">> := [], <<"rebalances">> := []}},
+        api_get(GlobalStatusPath)
+    ).
+
+%% Starts a rebalance through the REST API with 100 clients connected to the
+%% donor, checks the local and global status while it is running (both as raw
+%% JSON and after translation), stops it and checks that nothing is reported
+%% as active afterwards.
+t_start_stop_rebalance(Config) ->
+    process_flag(trap_exit, true),
+
+    [{DonorNode, DonorPort}, {RecipientNode, _}] = ?config(cluster_nodes, Config),
+
+    ?assertMatch(
+        {ok, 200, #{<<"status">> := <<"disabled">>}},
+        api_get(["load_rebalance", "status"])
+    ),
+
+    Conns = emqtt_connect_many(DonorPort, 100),
+
+    %% The example request body is used without its `nodes` entry.
+    StartOpts = maps:without(
+        [nodes],
+        emqx_node_rebalance_api:rebalance_example()
+    ),
+
+    ?assertMatch(
+        {ok, 200, #{}},
+        api_post(
+            ["load_rebalance", atom_to_list(DonorNode), "start"],
+            StartOpts
+        )
+    ),
+
+    StatusResponse = api_get(["load_rebalance", "status"]),
+
+    ?assertMatch(
+        {ok, 200, _},
+        StatusResponse
+    ),
+
+    {ok, 200, Status} = StatusResponse,
+
+    ?assertMatch(
+        #{process := rebalance, connection_eviction_rate := 10, session_eviction_rate := 20},
+        emqx_node_rebalance_api:translate(local_status_enabled, Status)
+    ),
+
+    DonorNodeBin = atom_to_binary(DonorNode),
+    RecipientNodeBin = atom_to_binary(RecipientNode),
+
+    %% A single request: the raw and the translated assertion below inspect
+    %% the same snapshot of the global status.
+    GlobalStatusResponse = api_get(["load_rebalance", "global_status"]),
+
+    ?assertMatch(
+        {ok, 200, #{
+            <<"evacuations">> := [],
+            <<"rebalances">> :=
+                [
+                    #{
+                        <<"state">> := _,
+                        <<"node">> := DonorNodeBin,
+                        <<"coordinator_node">> := _,
+                        <<"connection_eviction_rate">> := 10,
+                        <<"session_eviction_rate">> := 20,
+                        <<"donors">> := [DonorNodeBin],
+                        <<"recipients">> := [RecipientNodeBin]
+                    }
+                ]
+        }},
+        GlobalStatusResponse
+    ),
+
+    {ok, 200, GlobalStatus} = GlobalStatusResponse,
+
+    ?assertMatch(
+        #{
+            evacuations := [],
+            rebalances := [
+                #{
+                    state := _,
+                    node := DonorNodeBin,
+                    coordinator_node := _,
+                    connection_eviction_rate := 10,
+                    session_eviction_rate := 20,
+                    donors := [DonorNodeBin],
+                    recipients := [RecipientNodeBin]
+                }
+            ]
+        },
+        emqx_node_rebalance_api:translate(global_status, GlobalStatus)
+    ),
+
+    ?assertMatch(
+        {ok, 200, #{}},
+        api_post(
+            ["load_rebalance", atom_to_list(DonorNode), "stop"],
+            #{}
+        )
+    ),
+
+    ?assertMatch(
+        {ok, 200, #{<<"status">> := <<"disabled">>}},
+        api_get(["load_rebalance", "status"])
+    ),
+
+    ?assertMatch(
+        {ok, 200, #{<<"evacuations">> := [], <<"rebalances">> := []}},
+        api_get(["load_rebalance", "global_status"])
+    ),
+
+    ok = stop_many(Conns).
+
+%% The availability check must answer 200 before an evacuation is started on
+%% the donor, 503 while it is running and 200 again once it is stopped.
+t_availability_check(Config) ->
+    [{DonorNode, _} | _] = ?config(cluster_nodes, Config),
+    ?assertMatch(
+        {ok, 200, #{}},
+        api_get(["load_rebalance", "availability_check"])
+    ),
+
+    ok = rpc:call(DonorNode, emqx_node_rebalance_evacuation, start, [#{}]),
+
+    ?assertMatch(
+        {ok, 503, _},
+        api_get(["load_rebalance", "availability_check"])
+    ),
+
+    ok = rpc:call(DonorNode, emqx_node_rebalance_evacuation, stop, []),
+
+    ?assertMatch(
+        {ok, 200, #{}},
+        api_get(["load_rebalance", "availability_check"])
+    ).
+
+%%--------------------------------------------------------------------
+%% Helpers
+%%--------------------------------------------------------------------
+
+%% Performs a GET on the URI built from Path.
+%% Returns {ok, Code, DecodedBody} or the {error, _} tuple of the request.
+api_get(Path) ->
+    decode_response(request(get, uri(Path))).
+
+%% Performs a POST of Data on the URI built from Path.
+%% Returns {ok, Code, DecodedBody} or the {error, _} tuple of the request.
+api_post(Path, Data) ->
+    decode_response(request(post, uri(Path), Data)).
+
+%% Decodes the JSON body of a completed request into maps; request errors are
+%% passed through unchanged.
+decode_response({ok, Code, ResponseBody}) ->
+    {ok, Code, jiffy:decode(ResponseBody, [return_maps])};
+decode_response({error, _} = Error) ->
+    Error.
+
+%% Mocks emqx_common_test_http:default_auth_header/0 on the local node so that
+%% it returns whatever the same function returns on Node (fetched over rpc).
+%% NOTE(review): the mock is not unloaded here; presumably it goes away
+%% together with the process that created it - confirm.
+take_auth_header_from(Node) ->
+    meck:new(emqx_common_test_http, [passthrough]),
+    meck:expect(
+        emqx_common_test_http,
+        default_auth_header,
+        fun() -> rpc:call(Node, emqx_common_test_http, default_auth_header, []) end
+    ),
+    ok.
+
+%% Returns a per-test-case directory below the CT `priv_dir`, or `undefined`
+%% when the config has no `priv_dir`.
+case_specific_data_dir(Case, Config) ->
+    case ?config(priv_dir, Config) of
+        undefined -> undefined;
+        PrivDir -> filename:join(PrivDir, atom_to_list(Case))
+    end.
diff --git a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl new file mode 100644 index 000000000..54ecad026 --- /dev/null +++ b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_cli_SUITE.erl @@ -0,0 +1,291 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_cli_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("eunit/include/eunit.hrl"). +-include_lib("common_test/include/ct.hrl"). + +-import( + emqx_eviction_agent_test_helpers, + [emqtt_connect_many/2, stop_many/1, case_specific_node_name/3] +). + +-define(START_APPS, [emqx_eviction_agent, emqx_node_rebalance]). + +all() -> + emqx_common_test_helpers:all(?MODULE). + +init_per_suite(Config) -> + emqx_common_test_helpers:start_apps(?START_APPS), + Config. + +end_per_suite(Config) -> + emqx_common_test_helpers:stop_apps(lists:reverse(?START_APPS)), + Config. + +init_per_testcase(Case = t_rebalance, Config) -> + _ = emqx_node_rebalance_evacuation:stop(), + ClusterNodes = emqx_eviction_agent_test_helpers:start_cluster( + [ + {case_specific_node_name(?MODULE, Case, '_donor'), 2883}, + {case_specific_node_name(?MODULE, Case, '_recipient'), 3883} + ], + ?START_APPS + ), + [{cluster_nodes, ClusterNodes} | Config]; +init_per_testcase(_Case, Config) -> + _ = emqx_node_rebalance_evacuation:stop(), + _ = emqx_node_rebalance:stop(), + Config. + +end_per_testcase(t_rebalance, Config) -> + _ = emqx_node_rebalance_evacuation:stop(), + _ = emqx_node_rebalance:stop(), + _ = emqx_eviction_agent_test_helpers:stop_cluster( + ?config(cluster_nodes, Config), + ?START_APPS + ); +end_per_testcase(_Case, _Config) -> + _ = emqx_node_rebalance_evacuation:stop(), + _ = emqx_node_rebalance:stop(). 
+ +%%-------------------------------------------------------------------- +%% Tests +%%-------------------------------------------------------------------- + +t_evacuation(_Config) -> + %% usage + ok = emqx_node_rebalance_cli:cli(["foobar"]), + + %% status + ok = emqx_node_rebalance_cli:cli(["status"]), + ok = emqx_node_rebalance_cli:cli(["node-status"]), + ok = emqx_node_rebalance_cli:cli(["node-status", atom_to_list(node())]), + + %% start with invalid args + ?assertNot( + emqx_node_rebalance_cli:cli(["start", "--evacuation", "--foo-bar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli:cli(["start", "--evacuation", "--conn-evict-rate", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli:cli(["start", "--evacuation", "--sess-evict-rate", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli:cli(["start", "--evacuation", "--wait-takeover", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli:cli([ + "start", + "--evacuation", + "--migrate-to", + "nonexistent@node" + ]) + ), + ?assertNot( + emqx_node_rebalance_cli:cli([ + "start", + "--evacuation", + "--migrate-to", + "" + ]) + ), + ?assertNot( + emqx_node_rebalance_cli:cli([ + "start", + "--evacuation", + "--unknown-arg" + ]) + ), + ?assert( + emqx_node_rebalance_cli:cli([ + "start", + "--evacuation", + "--conn-evict-rate", + "10", + "--sess-evict-rate", + "10", + "--wait-takeover", + "10", + "--migrate-to", + atom_to_list(node()), + "--redirect-to", + "srv" + ]) + ), + + %% status + ok = emqx_node_rebalance_cli:cli(["status"]), + ok = emqx_node_rebalance_cli:cli(["node-status"]), + ok = emqx_node_rebalance_cli:cli(["node-status", atom_to_list(node())]), + + ?assertMatch( + {enabled, #{}}, + emqx_node_rebalance_evacuation:status() + ), + + %% already enabled + ?assertNot( + emqx_node_rebalance_cli:cli([ + "start", + "--evacuation", + "--conn-evict-rate", + "10", + "--redirect-to", + "srv" + ]) + ), + + %% stop + true = emqx_node_rebalance_cli:cli(["stop"]), + + false = 
emqx_node_rebalance_cli:cli(["stop"]), + + ?assertEqual( + disabled, + emqx_node_rebalance_evacuation:status() + ). + +t_rebalance(Config) -> + process_flag(trap_exit, true), + + [{DonorNode, DonorPort}, {RecipientNode, _}] = ?config(cluster_nodes, Config), + + %% start with invalid args + ?assertNot( + emqx_node_rebalance_cli(DonorNode, ["start", "--foo-bar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli(DonorNode, ["start", "--conn-evict-rate", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli(DonorNode, ["start", "--abs-conn-threshold", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli(DonorNode, ["start", "--rel-conn-threshold", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli(DonorNode, ["start", "--sess-evict-rate", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli(DonorNode, ["start", "--abs-sess-threshold", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli(DonorNode, ["start", "--rel-sess-threshold", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli(DonorNode, ["start", "--wait-takeover", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli(DonorNode, ["start", "--wait-health-check", "foobar"]) + ), + + ?assertNot( + emqx_node_rebalance_cli(DonorNode, [ + "start", + "--nodes", + "nonexistent@node" + ]) + ), + ?assertNot( + emqx_node_rebalance_cli(DonorNode, [ + "start", + "--nodes", + "" + ]) + ), + ?assertNot( + emqx_node_rebalance_cli(DonorNode, [ + "start", + "--nodes", + atom_to_list(RecipientNode) + ]) + ), + ?assertNot( + emqx_node_rebalance_cli(DonorNode, [ + "start", + "--unknown-arg" + ]) + ), + + Conns = emqtt_connect_many(DonorPort, 20), + + ?assert( + emqx_node_rebalance_cli(DonorNode, [ + "start", + "--conn-evict-rate", + "10", + "--abs-conn-threshold", + "10", + "--rel-conn-threshold", + "1.1", + "--sess-evict-rate", + "10", + "--abs-sess-threshold", + "10", + "--rel-sess-threshold", + "1.1", + "--wait-takeover", + "10", + "--nodes", + atom_to_list(DonorNode) ++ "," ++ + 
atom_to_list(RecipientNode) + ]) + ), + + %% status + ok = emqx_node_rebalance_cli(DonorNode, ["status"]), + ok = emqx_node_rebalance_cli(DonorNode, ["node-status"]), + ok = emqx_node_rebalance_cli(DonorNode, ["node-status", atom_to_list(DonorNode)]), + + ?assertMatch( + {enabled, #{}}, + rpc:call(DonorNode, emqx_node_rebalance, status, []) + ), + + %% already enabled + ?assertNot( + emqx_node_rebalance_cli(DonorNode, ["start"]) + ), + + %% stop + true = emqx_node_rebalance_cli(DonorNode, ["stop"]), + + false = emqx_node_rebalance_cli(DonorNode, ["stop"]), + + ?assertEqual( + disabled, + rpc:call(DonorNode, emqx_node_rebalance, status, []) + ), + + ok = stop_many(Conns). + +%%-------------------------------------------------------------------- +%% Helpers +%%-------------------------------------------------------------------- + +emqx_node_rebalance_cli(Node, Args) -> + case rpc:call(Node, emqx_node_rebalance_cli, cli, [Args]) of + {badrpc, Reason} -> + error(Reason); + Result -> + Result + end. diff --git a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl new file mode 100644 index 000000000..cdafad97a --- /dev/null +++ b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_SUITE.erl @@ -0,0 +1,271 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_evacuation_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("emqx/include/emqx_mqtt.hrl"). +-include_lib("emqx/include/asserts.hrl"). +-include_lib("eunit/include/eunit.hrl"). +-include_lib("common_test/include/ct.hrl"). +-include_lib("snabbkaffe/include/snabbkaffe.hrl"). 
+
+-import(
+    emqx_eviction_agent_test_helpers,
+    [emqtt_connect/1, emqtt_try_connect/1, case_specific_node_name/3]
+).
+
+all() ->
+    [{group, Name} || Name <- [one_node, two_node]].
+
+groups() ->
+    OneNodeCases = [
+        t_agent_busy,
+        t_already_started,
+        t_not_started,
+        t_start,
+        t_persistence,
+        t_unknown_messages
+    ],
+    TwoNodeCases = [
+        t_conn_evicted,
+        t_migrate_to,
+        t_session_evicted
+    ],
+    [{one_node, [], OneNodeCases}, {two_node, [], TwoNodeCases}].
+
+init_per_suite(Config) ->
+    ok = emqx_common_test_helpers:start_apps([]),
+    Config.
+
+end_per_suite(_Config) ->
+    ok = emqx_common_test_helpers:stop_apps([]).
+
+%% The group name doubles as the cluster type stored in the config.
+init_per_group(Group, Config) when Group =:= one_node; Group =:= two_node ->
+    [{cluster_type, Group} | Config].
+
+end_per_group(_Group, _Config) ->
+    ok.
+
+%% Starts a cluster of one or two nodes (depending on the cluster type) with
+%% case-specific node names and data dir, then starts a snabbkaffe trace.
+init_per_testcase(Case, Config) ->
+    NodeSpec = fun(Suffix, Port) ->
+        {case_specific_node_name(?MODULE, Case, Suffix), Port}
+    end,
+    NodesWithPorts =
+        case ?config(cluster_type, Config) of
+            one_node ->
+                [NodeSpec('_evacuated', 2883)];
+            two_node ->
+                [NodeSpec('_evacuated', 2883), NodeSpec('_recipient', 3883)]
+        end,
+    Apps = [emqx_eviction_agent, emqx_node_rebalance],
+    Env = [{emqx, data_dir, case_specific_data_dir(Case, Config)}],
+    ClusterNodes = emqx_eviction_agent_test_helpers:start_cluster(NodesWithPorts, Apps, Env),
+    ok = snabbkaffe:start_trace(),
+    [{cluster_nodes, ClusterNodes} | Config].
+
+%% Stops snabbkaffe and the cluster started for the test case.
+end_per_testcase(_Case, Config) ->
+    ok = snabbkaffe:stop(),
+    ClusterNodes = ?config(cluster_nodes, Config),
+    Apps = [emqx_eviction_agent, emqx_node_rebalance],
+    ok = emqx_eviction_agent_test_helpers:stop_cluster(ClusterNodes, Apps).
+
+%%--------------------------------------------------------------------
+%% Tests
+%%--------------------------------------------------------------------
+
+%% One node tests
+
+%% Starting an evacuation must fail with `eviction_agent_busy` once the
+%% eviction agent has been enabled with `other_rebalance`.
+t_agent_busy(Config) ->
+    [{Node, _Port}] = ?config(cluster_nodes, Config),
+    ok = rpc:call(Node, emqx_eviction_agent, enable, [other_rebalance, undefined]),
+    StartResult = rpc:call(Node, emqx_node_rebalance_evacuation, start, [opts(Config)]),
+    ?assertEqual({error, eviction_agent_busy}, StartResult).
+
+%% A second start while an evacuation is running returns `already_started`.
+t_already_started(Config) ->
+    [{Node, _Port}] = ?config(cluster_nodes, Config),
+    Start = fun() ->
+        rpc:call(Node, emqx_node_rebalance_evacuation, start, [opts(Config)])
+    end,
+    ok = Start(),
+    ?assertEqual({error, already_started}, Start()).
+
+%% Stopping without a prior start returns `not_started`.
+t_not_started(Config) ->
+    [{Node, _Port}] = ?config(cluster_nodes, Config),
+    StopResult = rpc:call(Node, emqx_node_rebalance_evacuation, stop, []),
+    ?assertEqual({error, not_started}, StopResult).
+
+%% Once the evacuation is started, a connection attempt to the node is
+%% expected to fail with `use_another_server`.
+t_start(Config) ->
+    process_flag(trap_exit, true),
+    [{Node, Port}] = ?config(cluster_nodes, Config),
+    ok = rpc:call(Node, emqx_node_rebalance_evacuation, start, [opts(Config)]),
+    ?assertMatch(
+        {error, {use_another_server, #{}}},
+        emqtt_try_connect([{port, Port}])
+    ).
+
+%% Starts an evacuation, restarts the emqx_node_rebalance_evacuation child of
+%% emqx_node_rebalance_sup and expects the node to keep refusing connections
+%% and to report the evacuation as enabled with the original eviction rate.
+t_persistence(Config) ->
+    process_flag(trap_exit, true),
+
+    [{DonorNode, DonorPort}] = ?config(cluster_nodes, Config),
+
+    ok = rpc:call(DonorNode, emqx_node_rebalance_evacuation, start, [opts(Config)]),
+
+    ?assertMatch(
+        {error, {use_another_server, #{}}},
+        emqtt_try_connect([{port, DonorPort}])
+    ),
+
+    ok = rpc:call(DonorNode, supervisor, terminate_child, [
+        emqx_node_rebalance_sup, emqx_node_rebalance_evacuation
+    ]),
+    {ok, _} = rpc:call(DonorNode, supervisor, restart_child, [
+        emqx_node_rebalance_sup, emqx_node_rebalance_evacuation
+    ]),
+
+    ?assertMatch(
+        {error, {use_another_server, #{}}},
+        emqtt_try_connect([{port, DonorPort}])
+    ),
+    ?assertMatch(
+        {enabled, #{conn_evict_rate := 10}},
+        rpc:call(DonorNode, emqx_node_rebalance_evacuation, status, [])
+    ).
+
+%% Sends an unexpected info message, cast and call to the evacuation process
+%% while an evacuation is running; the call must be answered with `ignored`.
+t_unknown_messages(Config) ->
+    process_flag(trap_exit, true),
+
+    [{DonorNode, _DonorPort}] = ?config(cluster_nodes, Config),
+
+    ok = rpc:call(DonorNode, emqx_node_rebalance_evacuation, start, [opts(Config)]),
+
+    Pid = rpc:call(DonorNode, erlang, whereis, [emqx_node_rebalance_evacuation]),
+
+    Pid ! unknown,
+
+    ok = gen_server:cast(Pid, unknown),
+
+    ?assertEqual(
+        ignored,
+        gen_server:call(Pid, unknown)
+    ).
+
+%% Two node tests
+
+%% A client connected to the donor must be disconnected with reason code
+%% "use another server" once the evacuation starts, and new connection
+%% attempts to the donor must be refused.
+t_conn_evicted(Config) ->
+    process_flag(trap_exit, true),
+
+    [{DonorNode, DonorPort}, _] = ?config(cluster_nodes, Config),
+
+    {ok, C} = emqtt_connect([{clientid, <<"evacuated">>}, {port, DonorPort}]),
+
+    ?assertWaitEvent(
+        ok = rpc:call(DonorNode, emqx_node_rebalance_evacuation, start, [opts(Config)]),
+        #{?snk_kind := node_evacuation_evict_conn},
+        1000
+    ),
+
+    ?assertMatch(
+        {error, {use_another_server, #{}}},
+        emqtt_try_connect([{clientid, <<"connecting">>}, {port, DonorPort}])
+    ),
+
+    %% ?RC_USE_ANOTHER_SERVER (16#9C) replaces the literal 156 used before, in
+    %% line with the other eviction test of this suite.
+    receive
+        {'EXIT', C, {disconnected, ?RC_USE_ANOTHER_SERVER, _}} -> ok
+    after 1000 ->
+        ct:fail("Connection not evicted")
+    end.
+
+%% Checks migrate_to/1 on the donor: `undefined` yields the recipient, an
+%% unknown node yields nothing, and `undefined` yields nothing as well once
+%% the eviction agent is enabled on the recipient.
+t_migrate_to(Config) ->
+    [{Donor, _}, {Recipient, _}] = ?config(cluster_nodes, Config),
+    MigrateTo = fun(Arg) ->
+        rpc:call(Donor, emqx_node_rebalance_evacuation, migrate_to, [Arg])
+    end,
+
+    ?assertEqual([Recipient], MigrateTo(undefined)),
+    ?assertEqual([], MigrateTo(['unknown@node'])),
+
+    ok = rpc:call(Recipient, emqx_eviction_agent, enable, [test_rebalance, undefined]),
+
+    ?assertEqual([], MigrateTo(undefined)).
+
+%% A client with `clean_start = false` is disconnected from the donor with
+%% ?RC_USE_ANOTHER_SERVER, after which the channel registered for its client
+%% id is expected to live on the recipient node.
+t_session_evicted(Config) ->
+    process_flag(trap_exit, true),
+    [{Donor, DonorPort}, {Recipient, _}] = ?config(cluster_nodes, Config),
+    ClientId = <<"client_with_sess">>,
+
+    {ok, Client} = emqtt_connect([
+        {port, DonorPort}, {clientid, ClientId}, {clean_start, false}
+    ]),
+
+    ?assertWaitEvent(
+        ok = rpc:call(Donor, emqx_node_rebalance_evacuation, start, [opts(Config)]),
+        #{?snk_kind := node_evacuation_evict_sess_over},
+        5000
+    ),
+
+    receive
+        {'EXIT', Client, {disconnected, ?RC_USE_ANOTHER_SERVER, _}} -> ok
+    after 1000 ->
+        ct:fail("Connection not evicted")
+    end,
+
+    [ChannelPid] = rpc:call(Donor, emqx_cm_registry, lookup_channels, [ClientId]),
+    ?assertEqual(Recipient, node(ChannelPid)).
+
+%%--------------------------------------------------------------------
+%% Helpers
+%%--------------------------------------------------------------------
+
+%% Evacuation options shared by the test cases; `migrate_to` depends on the
+%% cluster type of the current group.
+opts(Config) ->
+    Targets = migrate_to(Config),
+    #{
+        migrate_to => Targets,
+        wait_takeover => 1,
+        sess_evict_rate => 10,
+        conn_evict_rate => 10,
+        server_reference => <<"srv">>
+    }.
+
+%% Nodes to migrate to: none in a one-node cluster, the second node otherwise.
+migrate_to(Config) ->
+    case ?config(cluster_type, Config) of
+        two_node ->
+            [_, {Recipient, _Port}] = ?config(cluster_nodes, Config),
+            [Recipient];
+        one_node ->
+            []
+    end.
+
+%% Returns a per-test-case directory below the CT `priv_dir`, or `undefined`
+%% when the config has no `priv_dir`.
+case_specific_data_dir(Case, Config) ->
+    case ?config(priv_dir, Config) of
+        undefined ->
+            undefined;
+        Dir ->
+            filename:join(Dir, atom_to_list(Case))
+    end.
diff --git a/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_persist_SUITE.erl b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_persist_SUITE.erl new file mode 100644 index 000000000..450280cb8 --- /dev/null +++ b/lib-ee/emqx_node_rebalance/test/emqx_node_rebalance_evacuation_persist_SUITE.erl @@ -0,0 +1,108 @@ +%%-------------------------------------------------------------------- +%% Copyright (c) 2022-2023 EMQ Technologies Co., Ltd. All Rights Reserved. +%%-------------------------------------------------------------------- + +-module(emqx_node_rebalance_evacuation_persist_SUITE). + +-compile(export_all). +-compile(nowarn_export_all). + +-include_lib("eunit/include/eunit.hrl"). +-include_lib("common_test/include/ct.hrl"). + +all() -> + emqx_common_test_helpers:all(?MODULE). + +init_per_suite(Config) -> + Config. + +end_per_suite(_Config) -> + ok. + +init_per_testcase(_Case, Config) -> + _ = emqx_node_rebalance_evacuation_persist:clear(), + Config. + +end_per_testcase(_Case, _Config) -> + _ = emqx_node_rebalance_evacuation_persist:clear(). + +%%-------------------------------------------------------------------- +%% Tests +%%-------------------------------------------------------------------- + +t_save_read(_Config) -> + DefaultOpts = #{ + server_reference => <<"default_ref">>, + conn_evict_rate => 2001, + sess_evict_rate => 2002, + wait_takeover => 2003 + }, + + Opts0 = #{ + server_reference => <<"ref">>, + conn_evict_rate => 1001, + sess_evict_rate => 1002, + wait_takeover => 1003 + }, + ok = emqx_node_rebalance_evacuation_persist:save(Opts0), + + {ok, ReadOpts0} = emqx_node_rebalance_evacuation_persist:read(DefaultOpts), + ?assertEqual(Opts0, ReadOpts0), + + Opts1 = Opts0#{server_reference => undefined}, + ok = emqx_node_rebalance_evacuation_persist:save(Opts1), + + {ok, ReadOpts1} = emqx_node_rebalance_evacuation_persist:read(DefaultOpts), + ?assertEqual(Opts1, ReadOpts1). 
+
+%% With an empty JSON object on disk, read/1 returns exactly the defaults.
+t_read_default(_Config) ->
+    ok = write_evacuation_file(<<"{}">>),
+    Defaults = #{
+        wait_takeover => 1003,
+        sess_evict_rate => 1002,
+        conn_evict_rate => 1001,
+        server_reference => <<"ref">>
+    },
+    {ok, Loaded} = emqx_node_rebalance_evacuation_persist:read(Defaults),
+    ?assertEqual(Defaults, Loaded).
+
+%% With unparsable content on disk, read/1 also returns exactly the defaults.
+t_read_bad_data(_Config) ->
+    ok = write_evacuation_file(<<"{bad json">>),
+    Defaults = #{
+        wait_takeover => 1003,
+        sess_evict_rate => 1002,
+        conn_evict_rate => 1001,
+        server_reference => <<"ref">>
+    },
+    {ok, Loaded} = emqx_node_rebalance_evacuation_persist:read(Defaults),
+    ?assertEqual(Defaults, Loaded).
+
+%% read/1 succeeds while the file exists and returns `none` after clear/0.
+t_clear(_Config) ->
+    ok = write_evacuation_file(<<"{}">>),
+    BeforeClear = emqx_node_rebalance_evacuation_persist:read(#{}),
+    ?assertMatch({ok, _}, BeforeClear),
+    ok = emqx_node_rebalance_evacuation_persist:clear(),
+    AfterClear = emqx_node_rebalance_evacuation_persist:read(#{}),
+    ?assertEqual(none, AfterClear).
+
+%%--------------------------------------------------------------------
+%% Helpers
+%%--------------------------------------------------------------------
+
+%% Writes Json to the evacuation file, creating its parent directory first.
+write_evacuation_file(Json) ->
+    ok = filelib:ensure_dir(emqx_node_rebalance_evacuation_persist:evacuation_filepath()),
+    Target = emqx_node_rebalance_evacuation_persist:evacuation_filepath(),
+    ok = file:write_file(Target, Json).
diff --git a/mix.exs b/mix.exs index 564d81ccf..cafeec7bd 100644 --- a/mix.exs +++ b/mix.exs @@ -402,7 +402,9 @@ defmodule EMQXUmbrella.MixProject do emqx_oracle: :permanent, emqx_bridge_oracle: :permanent, emqx_bridge_rabbitmq: :permanent, - emqx_ee_schema_registry: :permanent + emqx_ee_schema_registry: :permanent, + emqx_eviction_agent: :permanent, + emqx_node_rebalance: :permanent ], else: [] ) diff --git a/rebar.config.erl b/rebar.config.erl index a48a365c9..d556b41aa 100644 --- a/rebar.config.erl +++ b/rebar.config.erl @@ -478,7 +478,9 @@ relx_apps_per_edition(ee) -> emqx_oracle, emqx_bridge_oracle, emqx_bridge_rabbitmq, - emqx_ee_schema_registry + emqx_ee_schema_registry, + emqx_eviction_agent, + emqx_node_rebalance ]; relx_apps_per_edition(ce) -> [].